Diffstat (limited to 'llvm/lib')
320 files changed, 7953 insertions, 4812 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index ec78386..2d52f34 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -929,12 +929,11 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, if (!AllConstantInt) break; - // TODO: Try to intersect two inrange attributes? - if (!InRange) { - InRange = GEP->getInRange(); - if (InRange) - // Adjust inrange by offset until now. - InRange = InRange->sextOrTrunc(BitWidth).subtract(Offset); + // Adjust inrange offset and intersect inrange attributes + if (auto GEPRange = GEP->getInRange()) { + auto AdjustedGEPRange = GEPRange->sextOrTrunc(BitWidth).subtract(Offset); + InRange = + InRange ? InRange->intersectWith(AdjustedGEPRange) : AdjustedGEPRange; } Ptr = cast<Constant>(GEP->getOperand(0)); @@ -1374,7 +1373,7 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst, if (ConstantFP *CFP = dyn_cast<ConstantFP>(Operand)) return flushDenormalConstantFP(CFP, Inst, IsOutput); - if (isa<ConstantAggregateZero, UndefValue, ConstantExpr>(Operand)) + if (isa<ConstantAggregateZero, UndefValue>(Operand)) return Operand; Type *Ty = Operand->getType(); @@ -1390,6 +1389,9 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *Inst, Ty = VecTy->getElementType(); } + if (isa<ConstantExpr>(Operand)) + return Operand; + if (const auto *CV = dyn_cast<ConstantVector>(Operand)) { SmallVector<Constant *, 16> NewElts; for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { @@ -2629,14 +2631,14 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_ceil_d: return ConstantFoldFP( ceil, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_fabs_ftz: case Intrinsic::nvvm_fabs: return ConstantFoldFP( fabs, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_floor_ftz_f: @@ -2644,7 +2646,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_floor_d: return ConstantFoldFP( floor, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); case Intrinsic::nvvm_rcp_rm_ftz_f: @@ -2706,7 +2708,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return nullptr; return ConstantFoldFP( sqrt, APF, Ty, - nvvm::GetNVVMDenromMode( + nvvm::GetNVVMDenormMode( nvvm::UnaryMathIntrinsicShouldFTZ(IntrinsicID))); // AMDGCN Intrinsics: diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 2da6468..1959ab6 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -1079,15 +1079,16 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) { // add new space S = &BS->Spaces.emplace_back(B.Space); - // the space is full - set flag to report overlapping binding later - if (S->FreeRanges.empty()) { + // The space is full - there are no free slots left, or the rest of the + // slots are taken by an unbounded array. Set flag to report overlapping + // binding later. 
+ if (S->FreeRanges.empty() || S->FreeRanges.back().UpperBound < UINT32_MAX) { OverlappingBinding = true; continue; } // adjust the last free range lower bound, split it in two, or remove it BindingRange &LastFreeRange = S->FreeRanges.back(); - assert(LastFreeRange.UpperBound == UINT32_MAX); if (LastFreeRange.LowerBound == B.LowerBound) { if (B.UpperBound < UINT32_MAX) LastFreeRange.LowerBound = B.UpperBound + 1; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index dd9a44b..f1473b2 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3383,6 +3383,10 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, SrcSubscripts, DstSubscripts)) return false; + assert(isLoopInvariant(SrcBase, SrcLoop) && + isLoopInvariant(DstBase, DstLoop) && + "Expected SrcBase and DstBase to be loop invariant"); + int Size = SrcSubscripts.size(); LLVM_DEBUG({ dbgs() << "\nSrcSubscripts: "; @@ -3666,6 +3670,19 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, SCEVUnionPredicate(Assume, *SE)); } + // Even if the base pointers are the same, they may not be loop-invariant. It + // could lead to incorrect results, as we're analyzing loop-carried + // dependencies. Src and Dst can be in different loops, so we need to check + // the base pointer is invariant in both loops. + Loop *SrcLoop = LI->getLoopFor(Src->getParent()); + Loop *DstLoop = LI->getLoopFor(Dst->getParent()); + if (!isLoopInvariant(SrcBase, SrcLoop) || + !isLoopInvariant(DstBase, DstLoop)) { + LLVM_DEBUG(dbgs() << "The base pointer is not loop invariant.\n"); + return std::make_unique<Dependence>(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); + } + uint64_t EltSize = SrcLoc.Size.toRaw(); const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase); const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase); diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 82530e7..5907e21 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -5366,7 +5366,7 @@ static Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, Type *MidTy = CI->getType(); Type *DstTy = Ty; if (Src->getType() == Ty) { - auto FirstOp = static_cast<Instruction::CastOps>(CI->getOpcode()); + auto FirstOp = CI->getOpcode(); auto SecondOp = static_cast<Instruction::CastOps>(CastOpc); Type *SrcIntPtrTy = SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr; diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 3aa9909..2b0f212 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -983,33 +983,37 @@ MemDepResult MemoryDependenceResults::getNonLocalInfoForBlock( static void SortNonLocalDepInfoCache(MemoryDependenceResults::NonLocalDepInfo &Cache, unsigned NumSortedEntries) { - switch (Cache.size() - NumSortedEntries) { - case 0: - // done, no new entries. - break; - case 2: { - // Two new entries, insert the last one into place. - NonLocalDepEntry Val = Cache.back(); - Cache.pop_back(); - MemoryDependenceResults::NonLocalDepInfo::iterator Entry = - std::upper_bound(Cache.begin(), Cache.end() - 1, Val); - Cache.insert(Entry, Val); - [[fallthrough]]; + + // If only one entry, don't sort. + if (Cache.size() < 2) + return; + + unsigned s = Cache.size() - NumSortedEntries; + + // If the cache is already sorted, don't sort it again. 
+ if (s == 0) + return; + + // If no entry is sorted, sort the whole cache. + if (NumSortedEntries == 0) { + llvm::sort(Cache); + return; } - case 1: - // One new entry, Just insert the new value at the appropriate position. - if (Cache.size() != 1) { + + // If the number of unsorted entires is small and the cache size is big, using + // insertion sort is faster. Here use Log2_32 to quickly choose the sort + // method. + if (s < Log2_32(Cache.size())) { + while (s > 0) { NonLocalDepEntry Val = Cache.back(); Cache.pop_back(); MemoryDependenceResults::NonLocalDepInfo::iterator Entry = - llvm::upper_bound(Cache, Val); + std::upper_bound(Cache.begin(), Cache.end() - s + 1, Val); Cache.insert(Entry, Val); + s--; } - break; - default: - // Added many values, do a full scale sort. + } else { llvm::sort(Cache); - break; } } diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index e8d4e37..f1c3155 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -121,8 +121,18 @@ void ProfileSummaryInfo::computeThresholds() { ProfileSummaryBuilder::getHotCountThreshold(DetailedSummary); ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(DetailedSummary); - assert(ColdCountThreshold <= HotCountThreshold && - "Cold count threshold cannot exceed hot count threshold!"); + // When the hot and cold thresholds are identical, we would classify + // a count value as both hot and cold since we are doing an inclusive check + // (see ::is{Hot|Cold}Count(). To avoid this undesirable overlap, ensure the + // thresholds are distinct. + if (HotCountThreshold == ColdCountThreshold) { + if (ColdCountThreshold > 0) + (*ColdCountThreshold)--; + else + (*HotCountThreshold)++; + } + assert(ColdCountThreshold < HotCountThreshold && + "Cold count threshold should be less than hot count threshold!"); if (!hasPartialSampleProfile() || !ScalePartialSampleProfileWorkingSetSize) { HasHugeWorkingSetSize = HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 0990a0d..61a575c 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -2682,6 +2682,21 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, return getAddExpr(NewOps, PreservedFlags); } } + + // Try to push the constant operand into a ZExt: A + zext (-A + B) -> zext + // (B), if trunc (A) + -A + B does not unsigned-wrap. 
+ if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Ops[1])) { + const SCEV *B = ZExt->getOperand(0); + const SCEV *NarrowA = getTruncateExpr(A, B->getType()); + if (isa<SCEVAddExpr>(B) && + NarrowA == getNegativeSCEV(cast<SCEVAddExpr>(B)->getOperand(0)) && + getZeroExtendExpr(NarrowA, ZExt->getType()) == A && + hasFlags(StrengthenNoWrapFlags(this, scAddExpr, {NarrowA, B}, + SCEV::FlagAnyWrap), + SCEV::FlagNUW)) { + return getZeroExtendExpr(getAddExpr(NarrowA, B), ZExt->getType()); + } + } } // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y) diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index e475be2..6e92766 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -875,6 +875,34 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_toascii); } + if (T.isOSFreeBSD()) { + TLI.setAvailable(LibFunc_dunder_strtok_r); + TLI.setAvailable(LibFunc_memalign); + TLI.setAvailable(LibFunc_fputc_unlocked); + TLI.setAvailable(LibFunc_fputs_unlocked); + TLI.setAvailable(LibFunc_fread_unlocked); + TLI.setAvailable(LibFunc_fwrite_unlocked); + TLI.setAvailable(LibFunc_getc_unlocked); + TLI.setAvailable(LibFunc_getchar_unlocked); + TLI.setAvailable(LibFunc_putc_unlocked); + TLI.setAvailable(LibFunc_putchar_unlocked); + + TLI.setUnavailable(LibFunc___kmpc_alloc_shared); + TLI.setUnavailable(LibFunc___kmpc_free_shared); + TLI.setUnavailable(LibFunc_dunder_strndup); + TLI.setUnavailable(LibFunc_memccpy_chk); + TLI.setUnavailable(LibFunc_strlen_chk); + TLI.setUnavailable(LibFunc_fmaximum_num); + TLI.setUnavailable(LibFunc_fmaximum_numf); + TLI.setUnavailable(LibFunc_fmaximum_numl); + TLI.setUnavailable(LibFunc_fminimum_num); + TLI.setUnavailable(LibFunc_fminimum_numf); + TLI.setUnavailable(LibFunc_fminimum_numl); + TLI.setUnavailable(LibFunc_roundeven); + TLI.setUnavailable(LibFunc_roundevenf); + TLI.setUnavailable(LibFunc_roundevenl); + } + // As currently implemented in clang, NVPTX code has no standard library to // speak of. 
Headers provide a standard-ish library implementation, but many // of the signatures are wrong -- for example, many libm functions are not diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index c871070..7025b83 100644 --- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const { Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } @@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const { Result.TBAA = Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp index 15107c2..2e4063f 100644 --- a/llvm/lib/Analysis/UniformityAnalysis.cpp +++ b/llvm/lib/Analysis/UniformityAnalysis.cpp @@ -178,6 +178,7 @@ bool UniformityInfoWrapperPass::runOnFunction(Function &F) { void UniformityInfoWrapperPass::print(raw_ostream &OS, const Module *) const { OS << "UniformityInfo for function '" << m_function->getName() << "':\n"; + m_uniformityInfo.print(OS); } void UniformityInfoWrapperPass::releaseMemory() { diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 1b3da59..425ea31 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::exp: case Intrinsic::exp10: case Intrinsic::exp2: + case Intrinsic::ldexp: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: @@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::canonicalize: case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::ucmp: @@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( switch (ID) { case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: + case Intrinsic::lround: + case Intrinsic::llround: case Intrinsic::lrint: case Intrinsic::llrint: case Intrinsic::vp_lrint: @@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg( case Intrinsic::vp_is_fpclass: return OpdIdx == 0; case Intrinsic::powi: + case Intrinsic::ldexp: return OpdIdx == -1 || OpdIdx == 1; default: return OpdIdx == -1; @@ -240,30 +246,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI, return Intrinsic::not_intrinsic; } -struct InterleaveIntrinsic { - Intrinsic::ID Interleave, Deinterleave; -}; - -static InterleaveIntrinsic InterleaveIntrinsics[] = { - {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2}, - {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3}, - {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4}, - {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5}, - {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6}, - {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7}, - 
{Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8}, -}; - -Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - return InterleaveIntrinsics[Factor - 2].Interleave; -} - -Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) { - assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); - return InterleaveIntrinsics[Factor - 2].Deinterleave; -} - unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) { switch (ID) { case Intrinsic::vector_interleave2: @@ -1141,7 +1123,7 @@ Constant * llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup<Instruction> &Group) { // All 1's means mask is not needed. - if (Group.getNumMembers() == Group.getFactor()) + if (Group.isFull()) return nullptr; // TODO: support reversed access. @@ -1687,7 +1669,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // Case 1: A full group. Can Skip the checks; For full groups, if the wide // load would wrap around the address space we would do a memory access at // nullptr even without the transformation. - if (Group->getNumMembers() == Group->getFactor()) + if (Group->isFull()) continue; // Case 2: If first and last members of the group don't wrap this implies @@ -1722,7 +1704,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // Case 1: A full group. Can Skip the checks; For full groups, if the wide // store would wrap around the address space we would do a memory access at // nullptr even without the transformation. - if (Group->getNumMembers() == Group->getFactor()) + if (Group->isFull()) continue; // Interleave-store-group with gaps is implemented using masked wide store. diff --git a/llvm/lib/BinaryFormat/SFrame.cpp b/llvm/lib/BinaryFormat/SFrame.cpp index 3b436af..f1765d7 100644 --- a/llvm/lib/BinaryFormat/SFrame.cpp +++ b/llvm/lib/BinaryFormat/SFrame.cpp @@ -35,3 +35,36 @@ ArrayRef<EnumEntry<sframe::ABI>> sframe::getABIs() { }; return ArrayRef(ABIs); } + +ArrayRef<EnumEntry<sframe::FREType>> sframe::getFRETypes() { + static constexpr EnumEntry<sframe::FREType> FRETypes[] = { +#define HANDLE_SFRAME_FRE_TYPE(CODE, NAME) {#NAME, sframe::FREType::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FRETypes); +} + +ArrayRef<EnumEntry<sframe::FDEType>> sframe::getFDETypes() { + static constexpr EnumEntry<sframe::FDEType> FDETypes[] = { +#define HANDLE_SFRAME_FDE_TYPE(CODE, NAME) {#NAME, sframe::FDEType::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FDETypes); +} + +ArrayRef<EnumEntry<sframe::AArch64PAuthKey>> sframe::getAArch64PAuthKeys() { + static constexpr EnumEntry<sframe::AArch64PAuthKey> AArch64PAuthKeys[] = { +#define HANDLE_SFRAME_AARCH64_PAUTH_KEY(CODE, NAME) \ + {#NAME, sframe::AArch64PAuthKey::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(AArch64PAuthKeys); +} + +ArrayRef<EnumEntry<sframe::FREOffset>> sframe::getFREOffsets() { + static constexpr EnumEntry<sframe::FREOffset> FREOffsets[] = { +#define HANDLE_SFRAME_FRE_OFFSET(CODE, NAME) {#NAME, sframe::FREOffset::NAME}, +#include "llvm/BinaryFormat/SFrameConstants.def" + }; + return ArrayRef(FREOffsets); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp index 5d7c97a..6356d71 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp @@ -37,8 +37,8 @@ void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA, // 
unsigned long personality; /* Pointer to the personality routine */ // } - auto *EHInfo = - cast<MCSectionXCOFF>(Asm->getObjFileLowering().getCompactUnwindSection()); + auto *EHInfo = static_cast<MCSectionXCOFF *>( + Asm->getObjFileLowering().getCompactUnwindSection()); if (Asm->TM.getFunctionSections()) { // If option -ffunction-sections is on, append the function name to the // name of EH Info Table csect so that each function has its own EH Info diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp index de6ebcf..51342c6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -39,7 +39,7 @@ void ARMException::beginFunction(const MachineFunction *MF) { if (CFISecType == AsmPrinter::CFISection::Debug) { if (!hasEmittedCFISections) { if (Asm->getModuleCFISectionType() == AsmPrinter::CFISection::Debug) - Asm->OutStreamer->emitCFISections(false, true); + Asm->OutStreamer->emitCFISections(false, true, false); hasEmittedCFISections = true; } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index f1d3e96..6166271 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -4221,10 +4221,11 @@ MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { SectionKind Kind = CPE.getSectionKind(&DL); const Constant *C = CPE.Val.ConstVal; Align Alignment = CPE.Alignment; - if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(DL, Kind, C, - Alignment))) { - if (MCSymbol *Sym = S->getCOMDATSymbol()) { + auto *S = + getObjFileLowering().getSectionForConstant(DL, Kind, C, Alignment); + if (S && TM.getTargetTriple().isOSBinFormatCOFF()) { + if (MCSymbol *Sym = + static_cast<const MCSectionCOFF *>(S)->getCOMDATSymbol()) { if (Sym->isUndefined()) OutStreamer->emitSymbolAttribute(Sym, MCSA_Global); return Sym; diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 8abeb56..c5d6e40 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -1051,10 +1051,10 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) { // comdat key. A section may be comdat because of -ffunction-sections or // because it is comdat in the IR. MCSectionCOFF *GVSec = - GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr; + GVSym ? static_cast<MCSectionCOFF *>(&GVSym->getSection()) : nullptr; const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr; - MCSectionCOFF *DebugSec = cast<MCSectionCOFF>( + auto *DebugSec = static_cast<MCSectionCOFF *>( CompilerInfoAsm->getObjFileLowering().getCOFFDebugSymbolsSection()); DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 4fac4bb..6b8d08c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -109,9 +109,11 @@ void DwarfCFIException::beginBasicBlockSection(const MachineBasicBlock &MBB) { // chose not to be verbose in that case. And with `ForceDwarfFrameSection`, // we should always emit .debug_frame. 
if (CFISecType == AsmPrinter::CFISection::Debug || - Asm->TM.Options.ForceDwarfFrameSection) + Asm->TM.Options.ForceDwarfFrameSection || + Asm->TM.Options.MCOptions.EmitSFrameUnwind) Asm->OutStreamer->emitCFISections( - CFISecType == AsmPrinter::CFISection::EH, true); + CFISecType == AsmPrinter::CFISection::EH, true, + Asm->TM.Options.MCOptions.EmitSFrameUnwind); hasEmittedCFISections = true; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 11b8576..7188833 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -972,10 +972,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, // the call graph which could lead to some target function. For tail // calls, no return PC information is needed, unless tuning for GDB in // DWARF4 mode in which case we fake a return PC for compatibility. - const MCSymbol *PCAddr = - (!IsTail || CU.useGNUAnalogForDwarf5Feature()) - ? const_cast<MCSymbol *>(getLabelAfterInsn(TopLevelCallMI)) - : nullptr; + const MCSymbol *PCAddr = (!IsTail || CU.useGNUAnalogForDwarf5Feature()) + ? getLabelAfterInsn(TopLevelCallMI) + : nullptr; // For tail calls, it's necessary to record the address of the branch // instruction so that the debugger can show where the tail call occurred. diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 3b3e7a4..dcfd9aa 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2083,22 +2083,55 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (TBB == FBB) { MBB->splice(Loc, TBB, TBB->begin(), TIB); } else { + // Merge the debug locations, and hoist and kill the debug instructions from + // both branches. FIXME: We could probably try harder to preserve some debug + // instructions (but at least this isn't producing wrong locations). + MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc); + auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) { + assert(DI->isDebugInstr() && "Expected a debug instruction"); + if (DI->isDebugRef()) { + const TargetInstrInfo *TII = + MBB->getParent()->getSubtarget().getInstrInfo(); + const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE); + DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0, + DI->getDebugVariable(), DI->getDebugExpression()); + MBB->insert(Loc, &*DI); + return; + } + // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF. + if (DI->isDebugPHI()) { + DI->eraseFromParent(); + return; + } + // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef. + if (!DI->isDebugLabel()) + DI->setDebugValueUndef(); + DI->moveBefore(&*Loc); + }; + // TIB and FIB point to the end of the regions to hoist/merge in TBB and // FBB. MachineBasicBlock::iterator FE = FIB; MachineBasicBlock::iterator FI = FBB->begin(); for (MachineBasicBlock::iterator TI : make_early_inc_range(make_range(TBB->begin(), TIB))) { - // Move debug instructions and pseudo probes without modifying them. - // FIXME: This is the wrong thing to do for debug locations, which - // should at least be killed (and hoisted from BOTH blocks). - if (TI->isDebugOrPseudoInstr()) { - TI->moveBefore(&*Loc); + // Hoist and kill debug instructions from FBB. After this loop FI points + // to the next non-debug instruction to hoist (checked in assert after the + // TBB debug instruction handling code). 
+ while (FI != FE && FI->isDebugInstr()) + HoistAndKillDbgInstr(FI++); + + // Kill debug instructions before moving. + if (TI->isDebugInstr()) { + HoistAndKillDbgInstr(TI); continue; } - // Get the next non-meta instruction in FBB. - FI = skipDebugInstructionsForward(FI, FE, false); + // FI and TI now point to identical non-debug instructions. + assert(FI != FE && "Unexpected end of FBB range"); + // Pseudo probes are excluded from the range when identifying foldable + // instructions, so we don't expect to see one now. + assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range"); // NOTE: The loop above checks CheckKillDead but we can't do that here as // it modifies some kill markers after the check. assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) && @@ -2111,6 +2144,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { ++FI; } } + FBB->erase(FBB->begin(), FIB); if (UpdateLiveIns) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index c21058c..f16283b 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2095,6 +2095,10 @@ static bool isRemOfLoopIncrementWithLoopInvariant( if (!L->isLoopInvariant(RemAmt)) return false; + // Only works if the AddOffset is a loop invaraint + if (AddOffset && !L->isLoopInvariant(AddOffset)) + return false; + // Is the PHI a loop increment? auto LoopIncrInfo = getIVIncrement(PN, LI); if (!LoopIncrInfo) @@ -2765,6 +2769,29 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { return optimizeGatherScatterInst(II, II->getArgOperand(0)); case Intrinsic::masked_scatter: return optimizeGatherScatterInst(II, II->getArgOperand(1)); + case Intrinsic::masked_load: + // Treat v1X masked load as load X type. + if (auto *VT = dyn_cast<FixedVectorType>(II->getType())) { + if (VT->getNumElements() == 1) { + Value *PtrVal = II->getArgOperand(0); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, VT->getElementType(), AS)) + return true; + } + } + return false; + case Intrinsic::masked_store: + // Treat v1X masked store as store X type. 
+ if (auto *VT = + dyn_cast<FixedVectorType>(II->getArgOperand(0)->getType())) { + if (VT->getNumElements() == 1) { + Value *PtrVal = II->getArgOperand(1); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, VT->getElementType(), AS)) + return true; + } + } + return false; } SmallVector<Value *, 2> PtrOps; diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 9512f79..810dc29 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -101,6 +101,7 @@ CGOPT(EABI, EABIVersion) CGOPT(DebuggerKind, DebuggerTuningOpt) CGOPT(bool, EnableStackSizeSection) CGOPT(bool, EnableAddrsig) +CGOPT(bool, EnableCallGraphSection) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableStaticDataPartitioning) @@ -461,6 +462,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableAddrsig); + static cl::opt<bool> EnableCallGraphSection( + "call-graph-section", cl::desc("Emit a call graph section"), + cl::init(false)); + CGBINDOPT(EnableCallGraphSection); + static cl::opt<bool> EmitCallSiteInfo( "emit-call-site-info", cl::desc( @@ -595,6 +601,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter(); Options.EnableStaticDataPartitioning = getEnableStaticDataPartitioning(); Options.EmitAddrsig = getEnableAddrsig(); + Options.EmitCallGraphSection = getEnableCallGraphSection(); Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 8855740f..9b2851e 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, llvm_unreachable("Deinterleave node should already have ReplacementNode"); break; case ComplexDeinterleavingOperation::Splat: { - auto *NewTy = VectorType::getDoubleElementsVectorType( - cast<VectorType>(Node->Real->getType())); auto *R = dyn_cast<Instruction>(Node->Real); auto *I = dyn_cast<Instruction>(Node->Imag); if (R && I) { // Splats that are not constant are interleaved where they are located Instruction *InsertPoint = (I->comesBefore(R) ? 
R : I)->getNextNode(); IRBuilder<> IRB(InsertPoint); - ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2, - NewTy, {Node->Real, Node->Imag}); + ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag}); } else { - ReplacementNode = Builder.CreateIntrinsic( - Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag}); + ReplacementNode = + Builder.CreateVectorInterleave({Node->Real, Node->Imag}); } break; } @@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0); auto *A = replaceNode(Builder, Node->Operands[0]); auto *B = replaceNode(Builder, Node->Operands[1]); - auto *NewMaskTy = VectorType::getDoubleElementsVectorType( - cast<VectorType>(MaskReal->getType())); - auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, - NewMaskTy, {MaskReal, MaskImag}); + auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag}); ReplacementNode = Builder.CreateSelect(NewMask, A, B); break; } @@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle( } if (!NewInit) - NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {Init, Constant::getNullValue(VTy)}); + NewInit = + Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); @@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation( auto *OldPHIImag = ReductionInfo[Imag].first; auto *NewPHI = OldToNewPHI[OldPHIReal]; - auto *VTy = cast<VectorType>(Real->getType()); - auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - // We have to interleave initial origin values coming from IncomingBlock Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming); Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming); IRBuilder<> Builder(Incoming->getTerminator()); - auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, - {InitReal, InitImag}); + auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag}); NewPHI->addIncoming(NewInit, Incoming); NewPHI->addIncoming(OperationReplacement, BackEdge); diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 714ec55..1c1047c 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -103,10 +103,10 @@ static void expandFPToI(Instruction *FPToI) { Value *A1 = nullptr; if (FloatVal->getType()->isHalfTy()) { if (FPToI->getOpcode() == Instruction::FPToUI) { - Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getIntNTy(32)); + Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getInt32Ty()); A1 = Builder.CreateZExt(A0, IntTy); } else { // FPToSI - Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getIntNTy(32)); + Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getInt32Ty()); A1 = Builder.CreateSExt(A0, IntTy); } FPToI->replaceAllUsesWith(A1); @@ -425,8 +425,8 @@ static void expandIToFP(Instruction *IToFP) { AAddr0->addIncoming(IsSigned ? 
Sub : IntVal, IfThen4); AAddr0->addIncoming(Shl, SwBB); Value *A0 = Builder.CreateTrunc(AAddr0, Builder.getInt32Ty()); - Value *A1 = Builder.CreateLShr(A0, Builder.getIntN(32, 2)); - Value *A2 = Builder.CreateAnd(A1, Builder.getIntN(32, 1)); + Value *A1 = Builder.CreateLShr(A0, Builder.getInt32(2)); + Value *A2 = Builder.CreateAnd(A1, Builder.getInt32(1)); Value *Conv16 = Builder.CreateZExt(A2, IntTy); Value *Or17 = Builder.CreateOr(AAddr0, Conv16); Value *Inc = Builder.CreateAdd(Or17, Builder.getIntN(BitWidth, 1)); @@ -457,9 +457,9 @@ static void expandIToFP(Instruction *IToFP) { Value *Extract = Builder.CreateLShr(Shr21, Builder.getIntN(BitWidth, 32)); Value *ExtractT62 = nullptr; if (FloatWidth > 80) - ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getIntNTy(64)); + ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getInt64Ty()); else - ExtractT62 = Builder.CreateTrunc(Extract, Builder.getIntNTy(32)); + ExtractT62 = Builder.CreateTrunc(Extract, Builder.getInt32Ty()); Builder.CreateBr(IfEnd26); // if.else: @@ -475,7 +475,7 @@ static void expandIToFP(Instruction *IToFP) { Value *Extract65 = Builder.CreateLShr(Shl26, Builder.getIntN(BitWidth, 32)); Value *ExtractT66 = nullptr; if (FloatWidth > 80) - ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getIntNTy(64)); + ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getInt64Ty()); else ExtractT66 = Builder.CreateTrunc(Extract65, Builder.getInt32Ty()); Builder.CreateBr(IfEnd26); @@ -507,30 +507,29 @@ static void expandIToFP(Instruction *IToFP) { Builder.getIntN(BitWidth, 63)); And29 = Builder.CreateAnd(Shr, Temp2, "and29"); } else { - Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getIntNTy(32)); + Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getInt32Ty()); And29 = Builder.CreateAnd( - Conv28, ConstantInt::getSigned(Builder.getIntNTy(32), 0x80000000)); + Conv28, ConstantInt::getSigned(Builder.getInt32Ty(), 0x80000000)); } unsigned TempMod = FPMantissaWidth % 32; Value *And34 = nullptr; Value *Shl30 = nullptr; if (FloatWidth > 80) { TempMod += 32; - Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getIntN(64, TempMod)); + Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getInt64(TempMod)); Shl30 = Builder.CreateAdd( - Add, - Builder.getIntN(64, ((1ull << (62ull - TempMod)) - 1ull) << TempMod)); - And34 = Builder.CreateZExt(Shl30, Builder.getIntNTy(128)); + Add, Builder.getInt64(((1ull << (62ull - TempMod)) - 1ull) << TempMod)); + And34 = Builder.CreateZExt(Shl30, Builder.getInt128Ty()); } else { - Value *Add = Builder.CreateShl(E0, Builder.getIntN(32, TempMod)); + Value *Add = Builder.CreateShl(E0, Builder.getInt32(TempMod)); Shl30 = Builder.CreateAdd( - Add, Builder.getIntN(32, ((1 << (30 - TempMod)) - 1) << TempMod)); + Add, Builder.getInt32(((1 << (30 - TempMod)) - 1) << TempMod)); And34 = Builder.CreateAnd(FloatWidth > 32 ? 
AAddr1Off32 : AAddr1Off0, - Builder.getIntN(32, (1 << TempMod) - 1)); + Builder.getInt32((1 << TempMod) - 1)); } Value *Or35 = nullptr; if (FloatWidth > 80) { - Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getIntNTy(128)); + Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getInt128Ty()); Value *Or31 = Builder.CreateOr(And29Trunc, And34); Value *Or34 = Builder.CreateShl(Or31, Builder.getIntN(128, 64)); Value *Temp3 = Builder.CreateShl(Builder.getIntN(128, 1), diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 012d873..9ba1782 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -1009,7 +1009,8 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, for (unsigned I = 0; I < NumValues; ++I) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]); + MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy, + Offsets[I]); auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, MRI.getType(VRegs[I]), commonAlignment(BaseAlign, Offsets[I])); @@ -1039,7 +1040,8 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, for (unsigned I = 0; I < NumValues; ++I) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]); + MIRBuilder.materializeObjectPtrOffset(Addr, DemoteReg, OffsetLLTy, + Offsets[I]); auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, MRI.getType(VRegs[I]), commonAlignment(BaseAlign, Offsets[I])); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8f513a..e84ba91 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, const TargetOptions &Options = MF->getTarget().Options; LLT DstType = MRI.getType(MI.getOperand(0).getReg()); - if (CanReassociate && - !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc))) + if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc)) return false; // Floating-point multiply-add with intermediate rounding. @@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, if (!HasFMAD && !HasFMA) return false; - AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD; + AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD; // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) return false; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index dc5dfab..fd38c30 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1409,7 +1409,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { Regs.size() == 1 ? 
LI.getMetadata(LLVMContext::MD_range) : nullptr; for (unsigned i = 0; i < Regs.size(); ++i) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8); Align BaseAlign = getMemOpAlign(LI); @@ -1448,7 +1448,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { for (unsigned i = 0; i < Vals.size(); ++i) { Register Addr; - MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8); + MIRBuilder.materializeObjectPtrOffset(Addr, Base, OffsetTy, Offsets[i] / 8); MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8); Align BaseAlign = getMemOpAlign(SI); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index ed7b07f..d9d3569 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4170,7 +4170,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); - auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst); + auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrAddReg, PtrReg, OffsetCst); auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy, SmallPtr, *SmallMMO); @@ -4277,8 +4277,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) { LLT PtrTy = MRI.getType(PtrReg); auto OffsetCst = MIRBuilder.buildConstant( LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); - auto SmallPtr = - MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); + auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst); MachineMemOperand *LargeMMO = MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); @@ -5349,7 +5348,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, unsigned ByteOffset = Offset / 8; Register NewAddrReg; - MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset); + MIRBuilder.materializeObjectPtrOffset(NewAddrReg, AddrReg, OffsetTy, + ByteOffset); MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, ByteOffset, PartTy); @@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. 
return UnableToLegalize; - if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) { + if (MI.getFlag(MachineInstr::FmAfn)) { unsigned Flags = MI.getFlags(); auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags); MIRBuilder.buildFPTrunc(Dst, Src32, Flags); @@ -9822,7 +9822,7 @@ LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val, if (DstOff != 0) { auto Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); - Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); + Ptr = MIB.buildObjectPtrOffset(PtrTy, Dst, Offset).getReg(0); } MIB.buildStore(Value, Ptr, *StoreMMO); @@ -9962,7 +9962,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, LLT SrcTy = MRI.getType(Src); Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset) .getReg(0); - LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0); } auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); @@ -9970,7 +9970,7 @@ LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src, Register StorePtr = Dst; if (CurrOffset != 0) { LLT DstTy = MRI.getType(Dst); - StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LdVal, StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); @@ -10060,7 +10060,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, LLT SrcTy = MRI.getType(Src); auto Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset); - LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0); + LoadPtr = MIB.buildObjectPtrOffset(SrcTy, Src, Offset).getReg(0); } LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); CurrOffset += CopyTy.getSizeInBytes(); @@ -10078,7 +10078,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, LLT DstTy = MRI.getType(Dst); auto Offset = MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset); - StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0); + StorePtr = MIB.buildObjectPtrOffset(DstTy, Dst, Offset).getReg(0); } MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); CurrOffset += CopyTy.getSizeInBytes(); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 121d7e8..27df7e3 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -208,11 +208,20 @@ MachineIRBuilder::buildPtrAdd(const DstOp &Res, const SrcOp &Op0, return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1}, Flags); } +MachineInstrBuilder MachineIRBuilder::buildObjectPtrOffset(const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { + return buildPtrAdd(Res, Op0, Op1, + MachineInstr::MIFlag::NoUWrap | + MachineInstr::MIFlag::InBounds); +} + std::optional<MachineInstrBuilder> MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0, - const LLT ValueTy, uint64_t Value) { + const LLT ValueTy, uint64_t Value, + std::optional<unsigned> Flags) { assert(Res == 0 && "Res is a result argument"); - assert(ValueTy.isScalar() && "invalid offset type"); + assert(ValueTy.isScalar() && "invalid offset type"); if (Value == 0) { Res = Op0; @@ -221,7 +230,14 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0, Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0)); auto Cst = buildConstant(ValueTy, Value); - return buildPtrAdd(Res, Op0, 
Cst.getReg(0)); + return buildPtrAdd(Res, Op0, Cst.getReg(0), Flags); +} + +std::optional<MachineInstrBuilder> MachineIRBuilder::materializeObjectPtrOffset( + Register &Res, Register Op0, const LLT ValueTy, uint64_t Value) { + return materializePtrAdd(Res, Op0, ValueTy, Value, + MachineInstr::MIFlag::NoUWrap | + MachineInstr::MIFlag::InBounds); } MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res, diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 65565b9..5e50898 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -587,6 +587,27 @@ static Value *getMask(Value *WideMask, unsigned Factor, } } + if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) { + // Check that the shuffle mask is: a) an interleave, b) all of the same + // set of the elements, and c) contained by the first source. (c) could + // be relaxed if desired. + unsigned NumSrcElts = + cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements(); + SmallVector<unsigned> StartIndexes; + if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor, + NumSrcElts * 2, StartIndexes) && + llvm::all_of(StartIndexes, [](unsigned Start) { return Start == 0; }) && + llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) { + return Idx < (int)NumSrcElts; + })) { + auto *LeafMaskTy = + VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); + IRBuilder<> Builder(SVI); + return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)); + } + } + return nullptr; } @@ -613,6 +634,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( << " and factor = " << Factor << "\n"); } else { assert(II); + if (II->getIntrinsicID() != Intrinsic::masked_load && + II->getIntrinsicID() != Intrinsic::vp_load) + return false; // Check mask operand. Handle both all-true/false and interleaved mask. Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); @@ -652,6 +676,9 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( Value *Mask = nullptr; if (II) { + if (II->getIntrinsicID() != Intrinsic::masked_store && + II->getIntrinsicID() != Intrinsic::vp_store) + return false; // Check mask operand. Handle both all-true/false and interleaved mask. 
Mask = getMask(getMaskOperand(II), Factor, cast<VectorType>(InterleaveValues[0]->getType())); diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 7153902..8b72c29 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -217,6 +217,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("nneg", MIToken::kw_nneg) .Case("disjoint", MIToken::kw_disjoint) .Case("samesign", MIToken::kw_samesign) + .Case("inbounds", MIToken::kw_inbounds) .Case("nofpexcept", MIToken::kw_nofpexcept) .Case("unpredictable", MIToken::kw_unpredictable) .Case("debug-location", MIToken::kw_debug_location) @@ -616,6 +617,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { .Case("!range", MIToken::md_range) .Case("!DIExpression", MIToken::md_diexpr) .Case("!DILocation", MIToken::md_dilocation) + .Case("!noalias.addrspace", MIToken::md_noalias_addrspace) .Default(MIToken::Error); } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index d7cd067..0627f17 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -78,6 +78,7 @@ struct MIToken { kw_nneg, kw_disjoint, kw_samesign, + kw_inbounds, kw_debug_location, kw_debug_instr_number, kw_dbg_instr_ref, @@ -151,6 +152,7 @@ struct MIToken { md_tbaa, md_alias_scope, md_noalias, + md_noalias_addrspace, md_range, md_diexpr, md_dilocation, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 3a364d5..6a464d9 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1477,7 +1477,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Token.is(MIToken::kw_nneg) || Token.is(MIToken::kw_disjoint) || Token.is(MIToken::kw_nusw) || - Token.is(MIToken::kw_samesign)) { + Token.is(MIToken::kw_samesign) || + Token.is(MIToken::kw_inbounds)) { // clang-format on // Mine frame and fast math flags if (Token.is(MIToken::kw_frame_setup)) @@ -1518,6 +1519,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Flags |= MachineInstr::NoUSWrap; if (Token.is(MIToken::kw_samesign)) Flags |= MachineInstr::SameSign; + if (Token.is(MIToken::kw_inbounds)) + Flags |= MachineInstr::InBounds; lex(); } @@ -3482,6 +3485,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (parseMDNode(AAInfo.NoAlias)) return true; break; + case MIToken::md_noalias_addrspace: + lex(); + if (parseMDNode(AAInfo.NoAliasAddrSpace)) + return true; + break; case MIToken::md_range: lex(); if (parseMDNode(Range)) @@ -3490,7 +3498,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { // TODO: Report an error on duplicate metadata nodes. 
default: return error("expected 'align' or '!tbaa' or '!alias.scope' or " - "'!noalias' or '!range'"); + "'!noalias' or '!range' or '!noalias.addrspace'"); } } if (expectAndConsume(MIToken::rparen)) diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 1e9fcf3..3e99e57 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -504,13 +504,21 @@ bool MIRParserImpl::initializeCallSiteInfo( return error(Error, ArgRegPair.Reg.SourceRange); CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo); } + if (!YamlCSInfo.CalleeTypeIds.empty()) { + for (auto CalleeTypeId : YamlCSInfo.CalleeTypeIds) { + IntegerType *Int64Ty = Type::getInt64Ty(Context); + CSInfo.CalleeTypeIds.push_back(ConstantInt::get(Int64Ty, CalleeTypeId, + /*isSigned=*/false)); + } + } - if (TM.Options.EmitCallSiteInfo) + if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection) MF.addCallSiteInfo(&*CallI, std::move(CSInfo)); } - if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo) - return error(Twine("Call site info provided but not used")); + if (!YamlMF.CallSitesInfo.empty() && + !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)) + return error("call site info provided but not used"); return false; } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index bc4e299..ce1834a 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -525,24 +525,30 @@ static void convertCallSiteObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST) { const auto *TRI = MF.getSubtarget().getRegisterInfo(); - for (auto CSInfo : MF.getCallSitesInfo()) { + for (auto [MI, CallSiteInfo] : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; yaml::MachineInstrLoc CallLocation; // Prepare instruction position. - MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); + MachineBasicBlock::const_instr_iterator CallI = MI->getIterator(); CallLocation.BlockNum = CallI->getParent()->getNumber(); // Get call instruction offset from the beginning of block. CallLocation.Offset = std::distance(CallI->getParent()->instr_begin(), CallI); YmlCS.CallLocation = CallLocation; + + auto [ArgRegPairs, CalleeTypeIds] = CallSiteInfo; // Construct call arguments and theirs forwarding register info. - for (auto ArgReg : CSInfo.second.ArgRegPairs) { + for (auto ArgReg : ArgRegPairs) { yaml::CallSiteInfo::ArgRegPair YmlArgReg; YmlArgReg.ArgNo = ArgReg.ArgNo; printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI); YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg); } + // Get type ids. + for (auto *CalleeTypeId : CalleeTypeIds) { + YmlCS.CalleeTypeIds.push_back(CalleeTypeId->getZExtValue()); + } YMF.CallSitesInfo.push_back(std::move(YmlCS)); } @@ -814,6 +820,8 @@ static void printMI(raw_ostream &OS, MFPrintState &State, OS << "nusw "; if (MI.getFlag(MachineInstr::SameSign)) OS << "samesign "; + if (MI.getFlag(MachineInstr::InBounds)) + OS << "inbounds "; // NOTE: Please add new MIFlags also to the MI_FLAGS_STR in // llvm/utils/update_mir_test_checks.py. 
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 429a17a..60d42e0 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -211,8 +211,7 @@ void MachineFunction::init() { ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); - // FIXME: Use Function::hasOptSize(). - if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.getAlign() && !F.hasOptSize()) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); @@ -920,7 +919,7 @@ MachineFunction::getCallSiteInfo(const MachineInstr *MI) { assert(MI->isCandidateForAdditionalCallInfo() && "Call site info refers only to call (MI) candidates"); - if (!Target.Options.EmitCallSiteInfo) + if (!Target.Options.EmitCallSiteInfo && !Target.Options.EmitCallGraphSection) return CallSitesInfo.end(); return CallSitesInfo.find(MI); } diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index da3665b..79047f7 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -585,6 +585,8 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { MIFlags |= MachineInstr::MIFlag::NoUSWrap; if (GEP->hasNoUnsignedWrap()) MIFlags |= MachineInstr::MIFlag::NoUWrap; + if (GEP->isInBounds()) + MIFlags |= MachineInstr::MIFlag::InBounds; } // Copy the nonneg flag. @@ -1860,8 +1862,12 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "nneg "; if (getFlag(MachineInstr::Disjoint)) OS << "disjoint "; + if (getFlag(MachineInstr::NoUSWrap)) + OS << "nusw "; if (getFlag(MachineInstr::SameSign)) OS << "samesign "; + if (getFlag(MachineInstr::InBounds)) + OS << "inbounds "; // Print the opcode name. if (TII) diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 0d25169..c612f8de 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << ", !noalias "; AAInfo.NoAlias->printAsOperand(OS, MST); } + if (AAInfo.NoAliasAddrSpace) { + OS << ", !noalias.addrspace "; + AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST); + } if (getRanges()) { OS << ", !range "; getRanges()->printAsOperand(OS, MST); diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 0f742c4..21bf052 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -423,7 +423,7 @@ void ModuloScheduleExpander::generateExistingPhis( // potentially define two values. unsigned MaxPhis = PrologStage + 2; if (!InKernel && (int)PrologStage <= LoopValStage) - MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1); + MaxPhis = std::max((int)MaxPhis - LoopValStage, 1); unsigned NumPhis = std::min(NumStages, MaxPhis); Register NewReg; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 2d7987a..7ede564 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -306,7 +306,12 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { /// number if it is not zero. If DstReg is a physical register and the /// existing subregister number of the def / use being updated is not zero, /// make sure to set it to the correct physical subregister. 
- void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx); + /// + /// If \p SubregToRegSrcInst is not empty, we are coalescing a + /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an + /// implicit-def of DstReg on instructions that define SrcReg. + void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx, + ArrayRef<MachineInstr *> SubregToRegSrcInst = {}); /// If the given machine operand reads only undefined lanes add an undef /// flag. @@ -1443,6 +1448,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // CopyMI may have implicit operands, save them so that we can transfer them // over to the newly materialized instruction after CopyMI is removed. + LaneBitmask NewMIImplicitOpsMask; SmallVector<MachineOperand, 4> ImplicitOps; ImplicitOps.reserve(CopyMI->getNumOperands() - CopyMI->getDesc().getNumOperands()); @@ -1457,6 +1463,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, (MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) && "unexpected implicit virtual register def"); ImplicitOps.push_back(MO); + if (MO.isDef() && MO.getReg().isVirtual() && + MRI->shouldTrackSubRegLiveness(DstReg)) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } @@ -1499,14 +1508,11 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } else { assert(MO.getReg() == NewMI.getOperand(0).getReg()); - // We're only expecting another def of the main output, so the range - // should get updated with the regular output range. - // - // FIXME: The range updating below probably needs updating to look at - // the super register if subranges are tracked. - assert(!MRI->shouldTrackSubRegLiveness(DstReg) && - "subrange update for implicit-def of super register may not be " - "properly handled"); + // If lanemasks need to be tracked, compile the lanemask of the NewMI + // implicit def operands to avoid subranges for the super-regs from + // being removed by code later on in this function. + if (MRI->shouldTrackSubRegLiveness(MO.getReg())) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } } @@ -1606,7 +1612,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); for (LiveInterval::SubRange &SR : DstInt.subranges()) { - if ((SR.LaneMask & DstMask).none()) { + if ((SR.LaneMask & DstMask).none() && + (SR.LaneMask & NewMIImplicitOpsMask).none()) { LLVM_DEBUG(dbgs() << "Removing undefined SubRange " << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n"); @@ -1870,11 +1877,14 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, } } -void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, - unsigned SubIdx) { +void RegisterCoalescer::updateRegDefsUses( + Register SrcReg, Register DstReg, unsigned SubIdx, + ArrayRef<MachineInstr *> SubregToRegSrcInsts) { bool DstIsPhys = DstReg.isPhysical(); LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg); + // Coalescing a COPY may expose reads of 'undef' subregisters. + // If so, then explicitly propagate 'undef' to those operands. 
if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
if (MO.isUndef())
continue;
@@ -1891,6 +1901,15 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
}
}
+  // If DstInt already has a subrange for the unused lanes, then we shouldn't
+  // create duplicate subranges when we update the interval for unused lanes.
+  LaneBitmask DstIntLaneMask;
+  if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
+    for (LiveInterval::SubRange &SR : DstInt->subranges())
+      DstIntLaneMask |= SR.LaneMask;
+  }
+
+  // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'.
SmallPtrSet<MachineInstr *, 8> Visited;
for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg),
E = MRI->reg_instr_end();
@@ -1914,6 +1933,80 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
+    bool RequiresImplicitRedef = false;
+    if (!SubregToRegSrcInsts.empty()) {
+      // We can only add an implicit-def and undef if the sub registers match,
+      // e.g.
+      //   %0:gr32 = INSTX
+      //   %0.sub8:gr32 = INSTY // top 24 bits of %0 still defined
+      //   %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32
+      //
+      // This cannot be transformed into:
+      //   %1.sub32:gr64 = INSTX
+      //   undef %1.sub8:gr64 = INSTY, implicit-def %1
+      //
+      // Because that would clobber the top 24 bits of %1.sub32.
+      if (is_contained(SubregToRegSrcInsts, UseMI) &&
+          all_of(UseMI->defs(),
+                 [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool {
+                   if (MO.getReg() != SrcReg || !MO.getSubReg() || MO.isUndef())
+                     return true;
+                   return SubIdx == MO.getSubReg();
+                 })) {
+        // Add implicit-def of super-register to express that the whole
+        // register is defined by the instruction.
+        MachineInstrBuilder MIB(*MF, UseMI);
+        MIB.addReg(DstReg, RegState::ImplicitDefine);
+        RequiresImplicitRedef = true;
+      }
+
+      // If the coalesced instruction doesn't fully define the register, we need
+      // to preserve the original super register liveness for SUBREG_TO_REG.
+      //
+      // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
+      // but it introduces liveness for other subregisters. Downstream users may
+      // have been relying on those bits, so we need to ensure their liveness is
+      // captured with a def of other lanes.
+      if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) {
+        // First check if there is sufficient granularity in terms of subranges.
+        LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg());
+        LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+        LaneBitmask UnusedLanes = DstMask & ~UsedLanes;
+        if ((UnusedLanes & ~DstIntLaneMask).any()) {
+          BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+          DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt);
+          DstIntLaneMask |= UnusedLanes;
+        }
+
+        // After duplicating the live ranges for the low/hi bits, we
+        // need to update the subranges of the DstReg interval such that
+        // for a case like this:
+        //
+        // entry:
+        //   16B %1:gpr32 = INSTRUCTION (<=> UseMI)
+        //        :
+        // if.then:
+        //   32B %1:gpr32 = MOVIMM32 ..
+        //   48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32
+        //
+        // Only the MOVIMM32 requires a def of the top lanes, and any intervals
+        // for the top 32 bits of the def at 16B should be removed.
+ for (LiveInterval::SubRange &SR : DstInt->subranges()) { + if (!Writes || RequiresImplicitRedef || + (SR.LaneMask & UnusedLanes).none()) + continue; + + assert((SR.LaneMask & UnusedLanes) == SR.LaneMask && + "Unexpected lanemask. Subrange needs finer granularity"); + + SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(false); + auto SegmentI = SR.find(UseIdx); + if (SegmentI != SR.end()) + SR.removeSegment(SegmentI, true); + } + } + } + // Replace SrcReg with DstReg in all UseMI operands. for (unsigned Op : Ops) { MachineOperand &MO = UseMI->getOperand(Op); @@ -1922,7 +2015,7 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, // turn a full def into a read-modify-write sub-register def and vice // versa. if (SubIdx && MO.isDef()) - MO.setIsUndef(!Reads); + MO.setIsUndef(!Reads || RequiresImplicitRedef); // A subreg use of a partially undef (super) register may be a complete // undef use now and then has to be marked that way. @@ -2025,6 +2118,30 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } +/// For a given use of value \p Idx, it returns the def in the current block, +/// or otherwise all possible defs in preceding blocks. +static bool FindDefInBlock(SmallPtrSetImpl<MachineBasicBlock *> &VisitedBlocks, + SmallVector<MachineInstr *> &Instrs, + LiveIntervals *LIS, LiveInterval &SrcInt, + MachineBasicBlock *MBB, VNInfo *Idx) { + if (!Idx->isPHIDef()) { + MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def); + assert(Def && "Unable to find a def for SUBREG_TO_REG source operand"); + Instrs.push_back(Def); + return true; + } + + bool Any = false; + if (VisitedBlocks.count(MBB)) + return false; + VisitedBlocks.insert(MBB); + for (MachineBasicBlock *Pred : MBB->predecessors()) { + Any |= FindDefInBlock(VisitedBlocks, Instrs, LIS, SrcInt, Pred, + SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(Pred))); + } + return Any; +} + bool RegisterCoalescer::joinCopy( MachineInstr *CopyMI, bool &Again, SmallPtrSetImpl<MachineInstr *> &CurrentErasedInstrs) { @@ -2156,6 +2273,35 @@ bool RegisterCoalescer::joinCopy( }); } + SmallVector<MachineInstr *> SubregToRegSrcInsts; + if (CopyMI->isSubregToReg()) { + // For the case where the copy instruction is a SUBREG_TO_REG, e.g. + // + // %0:gpr32 = movimm32 .. + // %1:gpr64 = SUBREG_TO_REG 0, %0, sub32 + // ... + // %0:gpr32 = COPY <something> + // + // After joining liveranges, the original `movimm32` will need an + // implicit-def to make it explicit that the entire register is written, + // i.e. + // + // undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0 + // ... + // undef %0.sub32:gpr64 = COPY <something> // Note that this does not + // // require an implicit-def, + // // because it has nothing to + // // do with the SUBREG_TO_REG. + LiveInterval &SrcInt = + LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI); + SmallPtrSet<MachineBasicBlock *, 8> VisitedBlocks; + if (!FindDefInBlock(VisitedBlocks, SubregToRegSrcInsts, LIS, SrcInt, + CopyMI->getParent(), + SrcInt.Query(SubregToRegSlotIdx).valueIn())) + llvm_unreachable("SUBREG_TO_REG src requires a def"); + } + ShrinkMask = LaneBitmask::getNone(); ShrinkMainRange = false; @@ -2225,9 +2371,12 @@ bool RegisterCoalescer::joinCopy( // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. 
- if (CP.getDstIdx()) + if (CP.getDstIdx()) { + assert(SubregToRegSrcInsts.empty() && "can this happen?"); updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); - updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); + } + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(), + SubregToRegSrcInsts); // Shrink subregister ranges if necessary. if (ShrinkMask.any()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d3df434..a43020e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SDPatternMatch.h" @@ -15262,23 +15263,31 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) { } } - // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller - // than X, and the And doesn't change the lower iX bits, we can move the - // AssertZext in front of the And and drop the AssertSext. if (Opcode == ISD::AssertZext && N0.getOpcode() == ISD::AND && - N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::AssertSext && isa<ConstantSDNode>(N0.getOperand(1))) { - SDValue BigA = N0.getOperand(0); - EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); const APInt &Mask = N0.getConstantOperandAPInt(1); - if (AssertVT.bitsLT(BigA_AssertVT) && - Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { - SDLoc DL(N); - SDValue NewAssert = - DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); - return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, - N0.getOperand(1)); + + // If we have (AssertZext (and (AssertSext X, iX), M), iY) and Y is smaller + // than X, and the And doesn't change the lower iX bits, we can move the + // AssertZext in front of the And and drop the AssertSext. + if (N0.getOperand(0).getOpcode() == ISD::AssertSext && N0.hasOneUse()) { + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); + if (AssertVT.bitsLT(BigA_AssertVT) && + Mask.countr_one() >= BigA_AssertVT.getScalarSizeInBits()) { + SDLoc DL(N); + SDValue NewAssert = + DAG.getNode(Opcode, DL, N->getValueType(0), BigA.getOperand(0), N1); + return DAG.getNode(ISD::AND, DL, N->getValueType(0), NewAssert, + N0.getOperand(1)); + } } + + // Remove AssertZext entirely if the mask guarantees the assertion cannot + // fail. + // TODO: Use KB countMinLeadingZeros to handle non-constant masks? + if (Mask.isIntN(AssertVT.getScalarSizeInBits())) + return N0; } return SDValue(); @@ -22778,8 +22787,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG); // If we store purely within object bounds just before its lifetime ends, // we can remove the store. 
- if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase, - StoreSize.getFixedValue() * 8)) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + if (LifetimeEndBase.contains( + DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8, + StoreBase, StoreSize.getFixedValue() * 8)) { LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump(); dbgs() << "\nwithin LIFETIME_END of : "; LifetimeEndBase.dump(); dbgs() << "\n"); @@ -28971,13 +28982,100 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, return SDValue(); } +static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, + const TargetLowering &TLI) { + // Match a pattern such as: + // (X | (X >> C0) | (X >> C1) | ...) & Mask + // This extracts contiguous parts of X and ORs them together before comparing. + // We can optimize this so that we directly check (X & SomeMask) instead, + // eliminating the shifts. + + EVT VT = Root.getValueType(); + + // TODO: Support vectors? + if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N0 = Root.getOperand(0); + SDValue N1 = Root.getOperand(1); + + if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1)) + return SDValue(); + + APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal(); + + SDValue Src; + const auto IsSrc = [&](SDValue V) { + if (!Src) { + Src = V; + return true; + } + + return Src == V; + }; + + SmallVector<SDValue> Worklist = {N0}; + APInt PartsMask(VT.getSizeInBits(), 0); + while (!Worklist.empty()) { + SDValue V = Worklist.pop_back_val(); + if (!V.hasOneUse() && (Src && Src != V)) + return SDValue(); + + if (V.getOpcode() == ISD::OR) { + Worklist.push_back(V.getOperand(0)); + Worklist.push_back(V.getOperand(1)); + continue; + } + + if (V.getOpcode() == ISD::SRL) { + SDValue ShiftSrc = V.getOperand(0); + SDValue ShiftAmt = V.getOperand(1); + + if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt)) + return SDValue(); + + auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal(); + if (ShiftAmtVal > RootMask.getBitWidth()) + return SDValue(); + + PartsMask |= (RootMask << ShiftAmtVal); + continue; + } + + if (IsSrc(V)) { + PartsMask |= RootMask; + continue; + } + + return SDValue(); + } + + if (!Src) + return SDValue(); + + SDLoc DL(Root); + return DAG.getNode(ISD::AND, DL, VT, + {Src, DAG.getConstant(PartsMask, DL, VT)}); +} + /// This is a stub for TargetLowering::SimplifySetCC. 
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); - return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); + if (SDValue C = + TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL)) + return C; + + if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND && + isNullConstant(N1)) { + + if (SDValue Res = matchMergedBFX(N0, DAG, TLI)) + return DAG.getSetCC(DL, VT, Res, N1, Cond); + } + + return SDValue(); } /// Given an ISD::SDIV node expressing a divide by constant, return @@ -29415,7 +29513,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { MachineMemOperand *MMO; }; - auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics { + auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics { if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) { int64_t Offset = 0; if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset())) @@ -29428,13 +29526,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { LSN->getBasePtr(), Offset /*base offset*/, LocationSize::precise(Size), LSN->getMemOperand()}; } - if (const auto *LN = cast<LifetimeSDNode>(N)) + if (const auto *LN = cast<LifetimeSDNode>(N)) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1), 0, - LocationSize::precise(LN->getSize()), + LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())), (MachineMemOperand *)nullptr}; + } // Default. return {false /*isvolatile*/, /*isAtomic*/ false, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 74172b2..ba0ab23 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; case ISD::FP_TO_FP16: LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); - if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { + if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); if ((SVT == MVT::f64 || SVT == MVT::f80) && diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index e5704c0..583a85a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" @@ -357,6 +358,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::PATCHPOINT: Res = PromoteIntRes_PATCHPOINT(N); break; + case ISD::READ_REGISTER: + Res = PromoteIntRes_READ_REGISTER(N); + break; } // If the result is null then the sub-method took care of registering it. 
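The matchMergedBFX combine in the DAGCombiner change above rests on a small identity: testing ((X | (X >> C0) | (X >> C1) | ...) & Mask) against zero is equivalent to testing X against the OR of Mask shifted left by each Ci (plus Mask itself for the unshifted term). A brute-force check of that identity for one concrete shape of the pattern, written as a standalone program rather than taken from the patch:

#include <cassert>
#include <cstdint>

int main() {
  // Mask plays the role of RootMask; PartsMask is what the combine builds.
  const uint32_t Mask = 0xFF;
  const uint32_t PartsMask = Mask | (Mask << 8) | (Mask << 16);
  // Bits above bit 23 never reach the mask for these shift amounts, so this
  // range covers all interesting inputs.
  for (uint32_t X = 0; X <= 0xFFFFFFu; ++X) {
    bool MergedIsZero = ((X | (X >> 8) | (X >> 16)) & Mask) == 0;
    bool DirectIsZero = (X & PartsMask) == 0;
    assert(MergedIsZero == DirectIsZero);
  }
  return 0;
}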
@@ -2076,6 +2080,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::PATCHPOINT: Res = PromoteIntOp_PATCHPOINT(N, OpNo); break; + case ISD::WRITE_REGISTER: + Res = PromoteIntOp_WRITE_REGISTER(N, OpNo); + break; case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: case ISD::EXPERIMENTAL_VP_STRIDED_STORE: Res = PromoteIntOp_VP_STRIDED(N, OpNo); @@ -2853,6 +2860,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_WRITE_REGISTER(SDNode *N, + unsigned OpNo) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.write_register with illegal type", Fn, + N->getDebugLoc())); + return N->getOperand(0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) { assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) || (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4)); @@ -3127,6 +3143,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VSCALE: ExpandIntRes_VSCALE(N, Lo, Hi); break; + + case ISD::READ_REGISTER: + ExpandIntRes_READ_REGISTER(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -5471,6 +5491,18 @@ void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo, SplitInteger(Res, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, + SDValue &Hi) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc())); + ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + Lo = DAG.getPOISON(LoVT); + Hi = DAG.getPOISON(HiVT); +} + //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// @@ -5537,6 +5569,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VP_STRIDED_STORE: Res = ExpandIntOp_VP_STRIDED(N, OpNo); break; + case ISD::WRITE_REGISTER: + Res = ExpandIntOp_WRITE_REGISTER(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
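The new READ_REGISTER and WRITE_REGISTER legalization paths report illegal-type failures through LLVMContext::diagnose instead of asserting, so how the message reaches the user depends on the diagnostic handler the client installed on the context. A minimal sketch of such a handler (an assumption about typical client code, not something this patch adds):

#include <memory>
#include "llvm/IR/DiagnosticHandler.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

namespace {
// Forwards every diagnostic the context reports, including the
// DiagnosticInfoLegalizationFailure messages emitted above, to stderr.
struct PrintAllDiagnostics : public llvm::DiagnosticHandler {
  bool handleDiagnostics(const llvm::DiagnosticInfo &DI) override {
    llvm::DiagnosticPrinterRawOStream Printer(llvm::errs());
    DI.print(Printer);
    llvm::errs() << "\n";
    return true; // Handled; skip the default handler.
  }
};
} // namespace

void installDiagnosticPrinter(llvm::LLVMContext &Ctx) {
  Ctx.setDiagnosticHandler(std::make_unique<PrintAllDiagnostics>());
}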
@@ -5935,6 +5970,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.write_register with illegal type", Fn, + N->getDebugLoc())); + + return N->getOperand(0); +} + SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) { SDLoc dl(N); @@ -6332,6 +6376,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_PATCHPOINT(SDNode *N) { return Res.getValue(0); } +SDValue DAGTypeLegalizer::PromoteIntRes_READ_REGISTER(SDNode *N) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc())); + + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + return DAG.getPOISON(NVT); +} + SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9b53724..2e13b18 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -378,6 +378,7 @@ private: SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); + SDValue PromoteIntRes_READ_REGISTER(SDNode *N); SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); @@ -428,6 +429,7 @@ private: SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); @@ -511,6 +513,7 @@ private: void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandShiftByConstant(SDNode *N, const APInt &Amt, SDValue &Lo, SDValue &Hi); @@ -534,6 +537,7 @@ private: SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo); SDValue ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo); SDValue ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); + SDValue ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo); void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &dl); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 6a2e782..31e7855 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -888,7 +888,8 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { } if (MI->isCandidateForAdditionalCallInfo()) { - if (DAG->getTarget().Options.EmitCallSiteInfo) + if (DAG->getTarget().Options.EmitCallSiteInfo || + DAG->getTarget().Options.EmitCallGraphSection) 
MF.addCallSiteInfo(MI, DAG->getCallSiteInfo(Node)); if (auto CalledGlobal = DAG->getCalledGlobal(Node)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 773ff48..02d1100 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { case ISD::TargetFrameIndex: ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex()); break; - case ISD::LIFETIME_START: - case ISD::LIFETIME_END: - ID.AddInteger(cast<LifetimeSDNode>(N)->getSize()); - break; case ISD::PSEUDO_PROBE: ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid()); ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex()); @@ -7847,20 +7843,43 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } } - // Perform trivial constant folding. - if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) - return SV; + if (N1.getOpcode() == ISD::POISON || N2.getOpcode() == ISD::POISON) { + switch (Opcode) { + case ISD::XOR: + case ISD::ADD: + case ISD::PTRADD: + case ISD::SUB: + case ISD::SIGN_EXTEND_INREG: + case ISD::UDIV: + case ISD::SDIV: + case ISD::UREM: + case ISD::SREM: + case ISD::MUL: + case ISD::AND: + case ISD::SSUBSAT: + case ISD::USUBSAT: + case ISD::UMIN: + case ISD::OR: + case ISD::SADDSAT: + case ISD::UADDSAT: + case ISD::UMAX: + case ISD::SMAX: + case ISD::SMIN: + // fold op(arg1, poison) -> poison, fold op(poison, arg2) -> poison. + return N2.getOpcode() == ISD::POISON ? N2 : N1; + } + } // Canonicalize an UNDEF to the RHS, even over a constant. - if (N1.isUndef()) { + if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() != ISD::UNDEF) { if (TLI->isCommutativeBinOp(Opcode)) { std::swap(N1, N2); } else { switch (Opcode) { case ISD::PTRADD: case ISD::SUB: - // fold op(undef, arg2) -> undef, fold op(poison, arg2) ->poison. - return N1.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT); + // fold op(undef, non_undef_arg2) -> undef. + return N1; case ISD::SIGN_EXTEND_INREG: case ISD::UDIV: case ISD::SDIV: @@ -7868,18 +7887,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::SREM: case ISD::SSUBSAT: case ISD::USUBSAT: - // fold op(undef, arg2) -> 0, fold op(poison, arg2) -> poison. - return N1.getOpcode() == ISD::POISON ? getPOISON(VT) - : getConstant(0, DL, VT); + // fold op(undef, non_undef_arg2) -> 0. + return getConstant(0, DL, VT); } } } // Fold a bunch of operators when the RHS is undef. - if (N2.isUndef()) { + if (N2.getOpcode() == ISD::UNDEF) { switch (Opcode) { case ISD::XOR: - if (N1.isUndef()) + if (N1.getOpcode() == ISD::UNDEF) // Handle undef ^ undef -> 0 special case. This is a common // idiom (misuse). return getConstant(0, DL, VT); @@ -7887,29 +7905,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::ADD: case ISD::PTRADD: case ISD::SUB: + // fold op(arg1, undef) -> undef. + return N2; case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: - // fold op(arg1, undef) -> undef, fold op(arg1, poison) -> poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) : getUNDEF(VT); + // fold op(arg1, undef) -> poison. + return getPOISON(VT); case ISD::MUL: case ISD::AND: case ISD::SSUBSAT: case ISD::USUBSAT: - // fold op(arg1, undef) -> 0, fold op(arg1, poison) -> poison. - return N2.getOpcode() == ISD::POISON ? 
getPOISON(VT) - : getConstant(0, DL, VT); + case ISD::UMIN: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> 0. + return N1.getOpcode() == ISD::UNDEF ? N2 : getConstant(0, DL, VT); case ISD::OR: case ISD::SADDSAT: case ISD::UADDSAT: - // fold op(arg1, undef) -> an all-ones constant, fold op(arg1, poison) -> - // poison. - return N2.getOpcode() == ISD::POISON ? getPOISON(VT) - : getAllOnesConstant(DL, VT); + case ISD::UMAX: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> -1. + return N1.getOpcode() == ISD::UNDEF ? N2 : getAllOnesConstant(DL, VT); + case ISD::SMAX: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MAX_INT. + return N1.getOpcode() == ISD::UNDEF + ? N2 + : getConstant( + APInt::getSignedMaxValue(VT.getScalarSizeInBits()), DL, + VT); + case ISD::SMIN: + // fold op(undef, undef) -> undef, fold op(arg1, undef) -> MIN_INT. + return N1.getOpcode() == ISD::UNDEF + ? N2 + : getConstant( + APInt::getSignedMinValue(VT.getScalarSizeInBits()), DL, + VT); } } + // Perform trivial constant folding. + if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) + return SV; + // Memoize this node if possible. SDNode *N; SDVTList VTs = getVTList(VT); @@ -9360,8 +9397,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, } SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, - SDValue Chain, int FrameIndex, - int64_t Size) { + SDValue Chain, int FrameIndex) { const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END; const auto VTs = getVTList(MVT::Other); SDValue Ops[2] = { @@ -9373,13 +9409,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl, FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); ID.AddInteger(FrameIndex); - ID.AddInteger(Size); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) return SDValue(E, 0); - LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), - dl.getDebugLoc(), VTs, Size); + LifetimeSDNode *N = + newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs); createOperands(N, Ops); CSEMap.InsertNode(N, IP); InsertNode(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1636465..306e068 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); SDLoc dl = getCurSDLoc(); + SDNodeFlags Flags; + if (auto *TruncInst = dyn_cast<FPMathOperator>(&I)) + Flags.copyFMF(*TruncInst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N, DAG.getTargetConstant( - 0, dl, TLI.getPointerTy(DAG.getDataLayout())))); + 0, dl, TLI.getPointerTy(DAG.getDataLayout())), + Flags)); } void SelectionDAGBuilder::visitFPExt(const User &I) { @@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, if (TM.getOptLevel() == CodeGenOptLevel::None) return; - const int64_t ObjectSize = - cast<ConstantInt>(I.getArgOperand(0))->getSExtValue(); const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1)); // First check that the Alloca is static, otherwise it won't have a @@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const 
CallInst &I, return; const int FrameIndex = SI->second; - Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize); + Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex); DAG.setRoot(Res); return; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 9474587..900da76 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { << " -> " << ASC->getDestAddressSpace() << ']'; - } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) { - OS << "<0 to " << LN->getSize() << ">"; } else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) { OS << '<' << AA->getAlign().value() << '>'; } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1764910..48d6b99 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9471,7 +9471,7 @@ SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG, ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, DAG.getNode(ISD::AND, DL, VT, Op, Neg), DAG.getConstant(DeBruijn, DL, VT)), - DAG.getConstant(ShiftAmt, DL, VT)); + DAG.getShiftAmountConstant(ShiftAmt, VT, DL)); Lookup = DAG.getSExtOrTrunc(Lookup, DL, getPointerTy(TD)); SmallVector<uint8_t> Table(BitWidth, 0); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index b79911b..2a8234a 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -588,7 +588,14 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, continue; Instruction *CheckLoc = dyn_cast<ReturnInst>(BB.getTerminator()); if (!CheckLoc && !DisableCheckNoReturn) - for (auto &Inst : BB) + for (auto &Inst : BB) { + if (IntrinsicInst *IB = dyn_cast<IntrinsicInst>(&Inst); + IB && (IB->getIntrinsicID() == Intrinsic::eh_sjlj_callsite)) { + // eh_sjlj_callsite has to be in same BB as the + // bb terminator. Don't insert within this range. + CheckLoc = IB; + break; + } if (auto *CB = dyn_cast<CallBase>(&Inst)) // Do stack check before noreturn calls that aren't nounwind (e.g: // __cxa_throw). @@ -596,6 +603,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, CheckLoc = CB; break; } + } if (!CheckLoc) continue; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index d4a3455..3c91b0e 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() { ISD::SDIVFIX, ISD::SDIVFIXSAT, ISD::UDIVFIX, ISD::UDIVFIXSAT, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, - ISD::IS_FPCLASS}, + ISD::IS_FPCLASS, ISD::FCBRT, + ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::FEXP, + ISD::FEXP2, ISD::FEXP10, + ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FCEIL, ISD::FRINT, + ISD::FTRUNC, ISD::FROUNDEVEN, + ISD::FTAN, ISD::FACOS, + ISD::FASIN, ISD::FATAN, + ISD::FCOSH, ISD::FSINH, + ISD::FTANH, ISD::FATAN2}, VT, Expand); // Overflow operations default to expand @@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() { // These operations default to expand for vector types. 
if (VT.isVector()) - setOperationAction( - {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG, - ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, - ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND, - ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, - ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2}, - VT, Expand); + setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND_VECTOR_INREG, + ISD::SIGN_EXTEND_VECTOR_INREG, + ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR, + ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND}, + VT, Expand); // Constrained floating-point operations default to expand. #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ @@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() { {MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128}, Expand); - // These library functions default to expand. - setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, - ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, - ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, - ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN, - ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH, - ISD::FATAN2}, - {MVT::f32, MVT::f64, MVT::f128}, Expand); - // Insert custom handling default for llvm.canonicalize.*. setOperationAction(ISD::FCANONICALIZE, {MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand); @@ -2062,7 +2062,7 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const { // FreeBSD has "__stack_chk_guard" defined externally on libc.so if (M.getDirectAccessExternalData() && - !TM.getTargetTriple().isWindowsGNUEnvironment() && + !TM.getTargetTriple().isOSCygMing() && !(TM.getTargetTriple().isPPC64() && TM.getTargetTriple().isOSFreeBSD()) && (!TM.getTargetTriple().isOSDarwin() || diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index a40ceaa..e9172f4 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -995,7 +995,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForLSDA( if (!LSDASection || (!F.hasComdat() && !TM.getFunctionSections())) return LSDASection; - const auto *LSDA = cast<MCSectionELF>(LSDASection); + const auto *LSDA = static_cast<const MCSectionELF *>(LSDASection); unsigned Flags = LSDA->getFlags(); const MCSymbolELF *LinkedToSym = nullptr; StringRef Group; @@ -1060,27 +1060,27 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant( auto &Context = getContext(); if (Kind.isMergeableConst4() && MergeableConst4Section) - return Context.getELFSection(".rodata.cst4." + SectionSuffix, + return Context.getELFSection(".rodata.cst4." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 4); if (Kind.isMergeableConst8() && MergeableConst8Section) - return Context.getELFSection(".rodata.cst8." + SectionSuffix, + return Context.getELFSection(".rodata.cst8." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 8); if (Kind.isMergeableConst16() && MergeableConst16Section) - return Context.getELFSection(".rodata.cst16." + SectionSuffix, + return Context.getELFSection(".rodata.cst16." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 16); if (Kind.isMergeableConst32() && MergeableConst32Section) - return Context.getELFSection(".rodata.cst32." + SectionSuffix, + return Context.getELFSection(".rodata.cst32." 
+ SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 32); if (Kind.isReadOnly()) - return Context.getELFSection(".rodata." + SectionSuffix, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC); + return Context.getELFSection(".rodata." + SectionSuffix + ".", + ELF::SHT_PROGBITS, ELF::SHF_ALLOC); assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); - return Context.getELFSection(".data.rel.ro." + SectionSuffix, + return Context.getELFSection(".data.rel.ro." + SectionSuffix + ".", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); } @@ -1734,7 +1734,8 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal( Name == getInstrProfSectionName(IPSK_covdata, Triple::COFF, /*AddSegmentInfo=*/false) || Name == getInstrProfSectionName(IPSK_covname, Triple::COFF, - /*AddSegmentInfo=*/false)) + /*AddSegmentInfo=*/false) || + Name == ".llvmbc" || Name == ".llvmcmd") Kind = SectionKind::getMetadata(); int Selection = 0; unsigned Characteristics = getCOFFSectionFlags(Kind, TM); @@ -2054,14 +2055,14 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection( unsigned Priority, const MCSymbol *KeySym) const { return getCOFFStaticStructorSection( getContext(), getContext().getTargetTriple(), true, Priority, KeySym, - cast<MCSectionCOFF>(StaticCtorSection)); + static_cast<MCSectionCOFF *>(StaticCtorSection)); } MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( unsigned Priority, const MCSymbol *KeySym) const { return getCOFFStaticStructorSection( getContext(), getContext().getTargetTriple(), false, Priority, KeySym, - cast<MCSectionCOFF>(StaticDtorSection)); + static_cast<MCSectionCOFF *>(StaticDtorSection)); } const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference( @@ -2388,23 +2389,25 @@ TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV, // here. if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) { if (GO->isDeclarationForLinker()) - return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM)) + return static_cast<const MCSectionXCOFF *>( + getSectionForExternalReference(GO, TM)) ->getQualNameSymbol(); if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->hasAttribute("toc-data")) - return cast<MCSectionXCOFF>( + return static_cast<const MCSectionXCOFF *>( SectionForGlobal(GVar, SectionKind::getData(), TM)) ->getQualNameSymbol(); SectionKind GOKind = getKindForGlobal(GO, TM); if (GOKind.isText()) - return cast<MCSectionXCOFF>( + return static_cast<const MCSectionXCOFF *>( getSectionForFunctionDescriptor(cast<Function>(GO), TM)) ->getQualNameSymbol(); if ((TM.getDataSections() && !GO->hasSection()) || GO->hasCommonLinkage() || GOKind.isBSSLocal() || GOKind.isThreadBSSLocal()) - return cast<MCSectionXCOFF>(SectionForGlobal(GO, GOKind, TM)) + return static_cast<const MCSectionXCOFF *>( + SectionForGlobal(GO, GOKind, TM)) ->getQualNameSymbol(); } @@ -2740,7 +2743,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry( MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA( const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const { - auto *LSDA = cast<MCSectionXCOFF>(LSDASection); + auto *LSDA = static_cast<MCSectionXCOFF *>(LSDASection); if (TM.getFunctionSections()) { // If option -ffunction-sections is on, append the function name to the // name of the LSDA csect so that each function has its own LSDA csect. 
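One visible effect of the getSectionForConstant change above is that every suffixed constant-pool section name now ends with a trailing '.' after the suffix. A tiny standalone illustration of the resulting names (the suffix value here is made up):

#include <iostream>
#include <string>

int main() {
  std::string SectionSuffix = "example_suffix"; // hypothetical suffix
  // Names built by the updated getSectionForConstant:
  std::cout << ".rodata.cst4." + SectionSuffix + "." << "\n";
  std::cout << ".rodata.cst8." + SectionSuffix + "." << "\n";
  std::cout << ".rodata.cst16." + SectionSuffix + "." << "\n";
  std::cout << ".rodata.cst32." + SectionSuffix + "." << "\n";
  std::cout << ".rodata." + SectionSuffix + "." << "\n";
  std::cout << ".data.rel.ro." + SectionSuffix + "." << "\n";
  return 0;
}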
diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp index 6267207..fd54190 100644 --- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp +++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp @@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable( AddrOfOldGV, Twine("__ref_").concat(GV->getName()), nullptr, GlobalVariable::NotThreadLocal); + // RefGV is created with isConstant = false, but we want to place RefGV into + // .rdata, not .data. It is important that the GlobalVariable be mutable + // from the compiler's point of view, so that the optimizer does not remove + // the global variable entirely and replace all references to it with its + // initial value. + // + // When the Windows hot-patch loader applies a hot-patch, it maps the + // pages of .rdata as read/write so that it can set each __ref_* variable + // to point to the original variable in the base image. Afterward, pages in + // .rdata are remapped as read-only. This protects the __ref_* variables from + // being overwritten during execution. + RefGV->setSection(".rdata"); + // Create debug info for the replacement global variable. DataLayout Layout = M->getDataLayout(); DIType *DebugType = DebugInfo.createPointerType( diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index b3798f1..a8559e7 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -183,7 +183,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { std::lock_guard<sys::Mutex> locked(lock); // Save information about our target - Arch = (Triple::ArchType)Obj.getArch(); + Arch = Obj.getArch(); IsTargetLittleEndian = Obj.isLittleEndian(); setMipsABI(Obj); @@ -1361,18 +1361,17 @@ std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyld::loadObject(const ObjectFile &Obj) { if (!Dyld) { if (Obj.isELF()) - Dyld = - createRuntimeDyldELF(static_cast<Triple::ArchType>(Obj.getArch()), - MemMgr, Resolver, ProcessAllSections, - std::move(NotifyStubEmitted)); + Dyld = createRuntimeDyldELF(Obj.getArch(), MemMgr, Resolver, + ProcessAllSections, + std::move(NotifyStubEmitted)); else if (Obj.isMachO()) - Dyld = createRuntimeDyldMachO( - static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver, - ProcessAllSections, std::move(NotifyStubEmitted)); + Dyld = createRuntimeDyldMachO(Obj.getArch(), MemMgr, Resolver, + ProcessAllSections, + std::move(NotifyStubEmitted)); else if (Obj.isCOFF()) - Dyld = createRuntimeDyldCOFF( - static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver, - ProcessAllSections, std::move(NotifyStubEmitted)); + Dyld = createRuntimeDyldCOFF(Obj.getArch(), MemMgr, Resolver, + ProcessAllSections, + std::move(NotifyStubEmitted)); else report_fatal_error("Incompatible object format!"); } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h index bd0d72f..0e95369 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h @@ -157,8 +157,7 @@ private: processSubtractRelocation(unsigned SectionID, relocation_iterator RelI, const MachOObjectFile &BaseObj, ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile &Obj = - static_cast<const MachOObjectFile&>(BaseObj); + const MachOObjectFile &Obj = BaseObj; MachO::any_relocation_info RE = 
Obj.getRelocation(RelI->getRawDataRefImpl()); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 840ca83..3aa4f7a 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1161,7 +1161,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel( Builder.restoreIP(AllocaIP); auto *KernelArgsPtr = Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args"); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) { llvm::Value *Arg = @@ -1189,7 +1189,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch( if (!updateToLocation(Loc)) return Loc.IP; - Builder.restoreIP(Loc.IP); // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. The host // pointer is used by the runtime library to identify the current target @@ -2617,7 +2616,7 @@ void OpenMPIRBuilder::emitReductionListCopy( Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, AttributeList FuncAttrs) { - IRBuilder<>::InsertPointGuard IPG(Builder); + InsertPointTy SavedIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()}, @@ -2630,7 +2629,6 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( WcFunc->addParamAttr(1, Attribute::NoUndef); BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc); Builder.SetInsertPoint(EntryBB); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // ReduceList: thread local Reduce list. // At the stage of the computation when this function is called, partially @@ -2845,6 +2843,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction( } Builder.CreateRetVoid(); + Builder.restoreIP(SavedIP); return WcFunc; } @@ -2853,7 +2852,6 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, AttributeList FuncAttrs) { LLVMContext &Ctx = M.getContext(); - IRBuilder<>::InsertPointGuard IPG(Builder); FunctionType *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt16Ty(), @@ -2872,7 +2870,6 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( SarFunc->addParamAttr(3, Attribute::SExt); BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc); Builder.SetInsertPoint(EntryBB); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Thread local Reduce list used to host the values of data to be reduced. Argument *ReduceListArg = SarFunc->getArg(0); @@ -3019,7 +3016,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - IRBuilder<>::InsertPointGuard IPG(Builder); + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3035,7 +3032,6 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); Builder.SetInsertPoint(EntryBlock); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. 
Argument *BufferArg = LtGCFunc->getArg(0); @@ -3123,13 +3119,14 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( } Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); return LtGCFunc; } Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - IRBuilder<>::InsertPointGuard IPG(Builder); + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3145,7 +3142,6 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGRFunc->getArg(0); @@ -3206,13 +3202,14 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList}) ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); return LtGRFunc; } Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - IRBuilder<>::InsertPointGuard IPG(Builder); + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3228,7 +3225,6 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); Builder.SetInsertPoint(EntryBlock); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGCFunc->getArg(0); @@ -3314,13 +3310,14 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( } Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); return LtGCFunc; } Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - IRBuilder<>::InsertPointGuard IPG(Builder); + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); auto *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3336,7 +3333,6 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. 
Argument *BufferArg = LtGRFunc->getArg(0); @@ -3397,6 +3393,7 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( Builder.CreateCall(ReduceFn, {ReduceList, ReductionList}) ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); return LtGRFunc; } @@ -3409,7 +3406,6 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { Expected<Function *> OpenMPIRBuilder::createReductionFunction( StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { - IRBuilder<>::InsertPointGuard IPG(Builder); auto *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, /* IsVarArg */ false); @@ -3422,7 +3418,6 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction( BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", ReductionFunc); Builder.SetInsertPoint(EntryBB); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Need to alloca memory here and deal with the pointers before getting // LHS/RHS pointers out @@ -3750,12 +3745,10 @@ static Error populateReductionFunction( Function *ReductionFunc, ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) { - IRBuilder<>::InsertPointGuard IPG(Builder); Module *Module = ReductionFunc->getParent(); BasicBlock *ReductionFuncBlock = BasicBlock::Create(Module->getContext(), "", ReductionFunc); Builder.SetInsertPoint(ReductionFuncBlock); - Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Value *LHSArrayPtr = nullptr; Value *RHSArrayPtr = nullptr; if (IsGPU) { @@ -5961,7 +5954,7 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, Builder.restoreIP(AllocaIP); AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name); ArgsBase->setAlignment(Align(8)); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); // Store the index value with offset in depend vector. for (unsigned I = 0; I < NumLoops; ++I) { @@ -8087,7 +8080,7 @@ void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc, ".offload_ptrs"); AllocaInst *ArgSizes = Builder.CreateAlloca( ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes"); - Builder.restoreIP(Loc.IP); + updateToLocation(Loc); MapperAllocas.ArgsBase = ArgsBase; MapperAllocas.Args = Args; MapperAllocas.ArgSizes = ArgSizes; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 28ed1e5..7159107 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1450,6 +1450,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.ll", true) .Case("h2f", true) .Case("swap.lo.hi.b64", true) + .Case("tanh.approx.f32", true) .Default(false); if (Expand) { @@ -2543,6 +2544,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, MDNode *MD = MDNode::get(Builder.getContext(), {}); LD->setMetadata(LLVMContext::MD_invariant_load, MD); return LD; + } else if (Name == "tanh.approx.f32") { + // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32 + FastMathFlags FMF; + FMF.setApproxFunc(); + Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0), + FMF); } else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") { Value *Arg = Name.ends_with('0') ? 
Builder.getInt32(0) : CI->getArgOperand(0); diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index b94dcac..4f37624 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -81,6 +81,10 @@ void DiagnosticInfoInlineAsm::print(DiagnosticPrinter &DP) const { DP << " at line " << getLocCookie(); } +void DiagnosticInfoLegalizationFailure::print(DiagnosticPrinter &DP) const { + DP << getLocationStr() << ": " << getMsgStr(); +} + DiagnosticInfoRegAllocFailure::DiagnosticInfoRegAllocFailure( const Twine &MsgStr, const Function &Fn, const DiagnosticLocation &DL, DiagnosticSeverity Severity) diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 28037d7..49c6dc7 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V, return CreateShuffleVector(V, Zeros, Name + ".splat"); } -Value *IRBuilderBase::CreatePreserveArrayAccessIndex( - Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex, - MDNode *DbgInfo) { +Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops, + const Twine &Name) { + assert(Ops.size() >= 2 && Ops.size() <= 8 && + "Unexpected number of operands to interleave"); + + // Make sure all operands are the same type. + assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type"); + +#ifndef NDEBUG + for (unsigned I = 1; I < Ops.size(); I++) { + assert(Ops[I]->getType() == Ops[0]->getType() && + "Vector interleave expects matching operand types!"); + } +#endif + + unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size()); + auto *SubvecTy = cast<VectorType>(Ops[0]->getType()); + Type *DestTy = VectorType::get(SubvecTy->getElementType(), + SubvecTy->getElementCount() * Ops.size()); + return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name); +} + +Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base, + unsigned Dimension, + unsigned LastIndex, + MDNode *DbgInfo) { auto *BaseType = Base->getType(); assert(isa<PointerType>(BaseType) && "Invalid Base ptr type for preserve.array.access.index."); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 6c35ade..58a1f74 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) { "Shouldn't change the signature"); return NewDecl; } + +struct InterleaveIntrinsic { + Intrinsic::ID Interleave, Deinterleave; +}; + +static InterleaveIntrinsic InterleaveIntrinsics[] = { + {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2}, + {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3}, + {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4}, + {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5}, + {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6}, + {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7}, + {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8}, +}; + +Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + return InterleaveIntrinsics[Factor - 2].Interleave; +} + +Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) { + assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); + return InterleaveIntrinsics[Factor - 2].Deinterleave; +} diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 0dbd07f..1157cbe 
100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const { Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct); Result.Scope = Info.lookup(LLVMContext::MD_alias_scope); Result.NoAlias = Info.lookup(LLVMContext::MD_noalias); + Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace); } return Result; } @@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) { setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct); setMetadata(LLVMContext::MD_alias_scope, N.Scope); setMetadata(LLVMContext::MD_noalias, N.NoAlias); + setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace); } void Instruction::setNoSanitizeMetadata() { diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 73e79c0..0323b4d 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/LTO/LTO.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StableHashing.h" @@ -742,18 +743,19 @@ Error LTO::add(std::unique_ptr<InputFile> Input, Conf.VisibilityScheme = Config::ELF; } - const SymbolResolution *ResI = Res.begin(); - for (unsigned I = 0; I != Input->Mods.size(); ++I) - if (Error Err = addModule(*Input, I, ResI, Res.end())) + ArrayRef<SymbolResolution> InputRes = Res; + for (unsigned I = 0; I != Input->Mods.size(); ++I) { + if (auto Err = addModule(*Input, InputRes, I, Res).moveInto(Res)) return Err; + } - assert(ResI == Res.end()); + assert(Res.empty()); return Error::success(); } -Error LTO::addModule(InputFile &Input, unsigned ModI, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { +Expected<ArrayRef<SymbolResolution>> +LTO::addModule(InputFile &Input, ArrayRef<SymbolResolution> InputRes, + unsigned ModI, ArrayRef<SymbolResolution> Res) { Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo(); if (!LTOInfo) return LTOInfo.takeError(); @@ -782,28 +784,32 @@ Error LTO::addModule(InputFile &Input, unsigned ModI, bool IsThinLTO = LTOInfo->IsThinLTO && (LTOMode != LTOK_UnifiedRegular); auto ModSyms = Input.module_symbols(ModI); - addModuleToGlobalRes(ModSyms, {ResI, ResE}, + addModuleToGlobalRes(ModSyms, Res, IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0, LTOInfo->HasSummary); if (IsThinLTO) - return addThinLTO(BM, ModSyms, ResI, ResE); + return addThinLTO(BM, ModSyms, Res); RegularLTO.EmptyCombinedModule = false; - Expected<RegularLTOState::AddedModule> ModOrErr = - addRegularLTO(BM, ModSyms, ResI, ResE); + auto ModOrErr = addRegularLTO(Input, InputRes, BM, ModSyms, Res); if (!ModOrErr) return ModOrErr.takeError(); + Res = ModOrErr->second; - if (!LTOInfo->HasSummary) - return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false); + if (!LTOInfo->HasSummary) { + if (Error Err = linkRegularLTO(std::move(ModOrErr->first), + /*LivenessFromIndex=*/false)) + return Err; + return Res; + } // Regular LTO module summaries are added to a dummy module that represents // the combined regular LTO module. 
if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, "")) return Err; - RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr)); - return Error::success(); + RegularLTO.ModsWithSummaries.push_back(std::move(ModOrErr->first)); + return Res; } // Checks whether the given global value is in a non-prevailing comdat @@ -839,10 +845,11 @@ handleNonPrevailingComdat(GlobalValue &GV, // Add a regular LTO object to the link. // The resulting module needs to be linked into the combined LTO module with // linkRegularLTO. -Expected<LTO::RegularLTOState::AddedModule> -LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { +Expected< + std::pair<LTO::RegularLTOState::AddedModule, ArrayRef<SymbolResolution>>> +LTO::addRegularLTO(InputFile &Input, ArrayRef<SymbolResolution> InputRes, + BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, + ArrayRef<SymbolResolution> Res) { RegularLTOState::AddedModule Mod; Expected<std::unique_ptr<Module>> MOrErr = BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true, @@ -855,13 +862,34 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, if (Error Err = M.materializeMetadata()) return std::move(Err); - // If cfi.functions is present and we are in regular LTO mode, LowerTypeTests - // will rename local functions in the merged module as "<function name>.1". - // This causes linking errors, since other parts of the module expect the - // original function name. - if (LTOMode == LTOK_UnifiedRegular) + if (LTOMode == LTOK_UnifiedRegular) { + // cfi.functions metadata is intended to be used with ThinLTO and may + // trigger invalid IR transformations if they are present when doing regular + // LTO, so delete it. if (NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions")) M.eraseNamedMetadata(CfiFunctionsMD); + } else if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) { + // Delete aliases entries for non-prevailing symbols on the ThinLTO side of + // this input file. + DenseSet<StringRef> Prevailing; + for (auto [I, R] : zip(Input.symbols(), InputRes)) + if (R.Prevailing && !I.getIRName().empty()) + Prevailing.insert(I.getIRName()); + std::vector<MDNode *> AliasGroups; + for (MDNode *AliasGroup : AliasesMD->operands()) { + std::vector<Metadata *> Aliases; + for (Metadata *Alias : AliasGroup->operands()) { + if (isa<MDString>(Alias) && + Prevailing.count(cast<MDString>(Alias)->getString())) + Aliases.push_back(Alias); + } + if (Aliases.size() > 1) + AliasGroups.push_back(MDTuple::get(RegularLTO.Ctx, Aliases)); + } + AliasesMD->clearOperands(); + for (MDNode *G : AliasGroups) + AliasesMD->addOperand(G); + } UpgradeDebugInfo(M); @@ -899,22 +927,22 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, std::set<const Comdat *> NonPrevailingComdats; SmallSet<StringRef, 2> NonPrevailingAsmSymbols; for (const InputFile::Symbol &Sym : Syms) { - assert(ResI != ResE); - SymbolResolution Res = *ResI++; + assert(!Res.empty()); + const SymbolResolution &R = Res.consume_front(); assert(MsymI != MsymE); ModuleSymbolTable::Symbol Msym = *MsymI++; Skip(); if (GlobalValue *GV = dyn_cast_if_present<GlobalValue *>(Msym)) { - if (Res.Prevailing) { + if (R.Prevailing) { if (Sym.isUndefined()) continue; Mod.Keep.push_back(GV); // For symbols re-defined with linker -wrap and -defsym options, // set the linkage to weak to inhibit IPO. The linkage will be // restored by the linker. 
- if (Res.LinkerRedefined) + if (R.LinkerRedefined) GV->setLinkage(GlobalValue::WeakAnyLinkage); GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage(); @@ -938,7 +966,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } // Set the 'local' flag based on the linker resolution for this symbol. - if (Res.FinalDefinitionInLinkageUnit) { + if (R.FinalDefinitionInLinkageUnit) { GV->setDSOLocal(true); if (GV->hasDLLImportStorageClass()) GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes:: @@ -947,7 +975,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } else if (auto *AS = dyn_cast_if_present<ModuleSymbolTable::AsmSymbol *>(Msym)) { // Collect non-prevailing symbols. - if (!Res.Prevailing) + if (!R.Prevailing) NonPrevailingAsmSymbols.insert(AS->first); } else { llvm_unreachable("unknown symbol type"); @@ -965,7 +993,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, CommonRes.Alignment = std::max(Align(SymAlignValue), CommonRes.Alignment); } - CommonRes.Prevailing |= Res.Prevailing; + CommonRes.Prevailing |= R.Prevailing; } } @@ -991,7 +1019,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } assert(MsymI == MsymE); - return std::move(Mod); + return std::make_pair(std::move(Mod), Res); } Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, @@ -1032,19 +1060,19 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod, } // Add a ThinLTO module to the link. -Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, - const SymbolResolution *&ResI, - const SymbolResolution *ResE) { - const SymbolResolution *ResITmp = ResI; +Expected<ArrayRef<SymbolResolution>> +LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, + ArrayRef<SymbolResolution> Res) { + ArrayRef<SymbolResolution> ResTmp = Res; for (const InputFile::Symbol &Sym : Syms) { - assert(ResITmp != ResE); - SymbolResolution Res = *ResITmp++; + assert(!ResTmp.empty()); + const SymbolResolution &R = ResTmp.consume_front(); if (!Sym.getIRName().empty()) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) + if (R.Prevailing) ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); } } @@ -1059,14 +1087,14 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); for (const InputFile::Symbol &Sym : Syms) { - assert(ResI != ResE); - SymbolResolution Res = *ResI++; + assert(!Res.empty()); + const SymbolResolution &R = Res.consume_front(); if (!Sym.getIRName().empty()) { auto GUID = GlobalValue::getGUIDAssumingExternalLinkage( GlobalValue::getGlobalIdentifier(Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) { + if (R.Prevailing) { assert(ThinLTO.PrevailingModuleForGUID[GUID] == BM.getModuleIdentifier()); @@ -1074,7 +1102,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // switch the linkage to `weak` to prevent IPOs from happening. // Find the summary in the module for this very GV and record the new // linkage so that we can switch it when we import the GV. 
- if (Res.LinkerRedefined) + if (R.LinkerRedefined) if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); @@ -1082,7 +1110,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // If the linker resolved the symbol to a local definition then mark it // as local in the summary for the module we are adding. - if (Res.FinalDefinitionInLinkageUnit) { + if (R.FinalDefinitionInLinkageUnit) { if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( GUID, BM.getModuleIdentifier())) { S->setDSOLocal(true); @@ -1110,7 +1138,7 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, } } - return Error::success(); + return Res; } unsigned LTO::getMaxTasks() const { diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index d662c42..18a85b3 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -43,13 +43,7 @@ add_llvm_component_library(LLVMMC MCRegisterInfo.cpp MCSchedule.cpp MCSection.cpp - MCSectionCOFF.cpp - MCSectionDXContainer.cpp - MCSectionELF.cpp - MCSectionGOFF.cpp MCSectionMachO.cpp - MCSectionWasm.cpp - MCSectionXCOFF.cpp MCStreamer.cpp MCSPIRVStreamer.cpp MCSubtargetInfo.cpp diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 9f52b3e..ae8dffc 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -559,20 +559,7 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) { } else { const MCSectionELF &Section = static_cast<const MCSectionELF &>(Symbol.getSection()); - - // We may end up with a situation when section symbol is technically - // defined, but should not be. That happens because we explicitly - // pre-create few .debug_* sections to have accessors. - // And if these sections were not really defined in the code, but were - // referenced, we simply error out. - if (!Section.isRegistered()) { - assert(static_cast<const MCSymbolELF &>(Symbol).getType() == - ELF::STT_SECTION); - Ctx.reportError(SMLoc(), - "Undefined section reference: " + Symbol.getName()); - continue; - } - + assert(Section.isRegistered()); if (Mode == NonDwoOnly && isDwoSection(Section)) continue; MSD.SectionIndex = Section.getOrdinal(); @@ -1100,7 +1087,8 @@ uint64_t ELFWriter::writeObject() { // Remember the offset into the file for this section. const uint64_t SecStart = align(RelSection->getAlign()); - writeRelocations(cast<MCSectionELF>(*RelSection->getLinkedToSection())); + writeRelocations( + static_cast<const MCSectionELF &>(*RelSection->getLinkedToSection())); uint64_t SecEnd = W.OS.tell(); RelSection->setOffsets(SecStart, SecEnd); @@ -1273,7 +1261,7 @@ bool ELFObjectWriter::useSectionSymbol(const MCValue &Val, // that it pointed to another string and subtracting 42 at runtime will // produce the wrong value. 
if (Sym->isInSection()) { - auto &Sec = cast<MCSectionELF>(Sym->getSection()); + auto &Sec = static_cast<const MCSectionELF &>(Sym->getSection()); unsigned Flags = Sec.getFlags(); if (Flags & ELF::SHF_MERGE) { if (C != 0) @@ -1325,13 +1313,14 @@ bool ELFObjectWriter::checkRelocation(SMLoc Loc, const MCSectionELF *From, void ELFObjectWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - const MCSectionELF &Section = cast<MCSectionELF>(*F.getParent()); + auto &Section = static_cast<const MCSectionELF &>(*F.getParent()); MCContext &Ctx = getContext(); const auto *SymA = cast_or_null<MCSymbolELF>(Target.getAddSym()); - const MCSectionELF *SecA = (SymA && SymA->isInSection()) - ? cast<MCSectionELF>(&SymA->getSection()) - : nullptr; + const MCSectionELF *SecA = + (SymA && SymA->isInSection()) + ? static_cast<const MCSectionELF *>(&SymA->getSection()) + : nullptr; if (DwoOS && !checkRelocation(Fixup.getLoc(), &Section, SecA)) return; diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 1871f5f..88188f3 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -336,7 +336,7 @@ void GOFFWriter::defineSymbols() { unsigned Ordinal = 0; // Process all sections. for (MCSection &S : Asm) { - auto &Section = cast<MCSectionGOFF>(S); + auto &Section = static_cast<MCSectionGOFF &>(S); Section.setOrdinal(++Ordinal); defineSectionSymbols(Section); } diff --git a/llvm/lib/MC/MCAsmInfoCOFF.cpp b/llvm/lib/MC/MCAsmInfoCOFF.cpp index 0b8781c..54717df 100644 --- a/llvm/lib/MC/MCAsmInfoCOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoCOFF.cpp @@ -12,7 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/BinaryFormat/COFF.h" #include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> using namespace llvm; @@ -49,6 +55,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { HasCOFFComdatConstants = true; } +bool MCAsmInfoCOFF::useCodeAlign(const MCSection &Sec) const { + return Sec.isText(); +} + void MCAsmInfoMicrosoft::anchor() {} MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default; @@ -64,3 +74,101 @@ MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() { // We don't create constants in comdat sections for MinGW. HasCOFFComdatConstants = false; } + +bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name) const { + if (COMDATSymbol || isUnique()) + return false; + + // FIXME: Does .section .bss/.data/.text work everywhere?? 
+ if (Name == ".text" || Name == ".data" || Name == ".bss") + return true; + + return false; +} + +void MCSectionCOFF::setSelection(int Selection) const { + assert(Selection != 0 && "invalid COMDAT selection type"); + this->Selection = Selection; + Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; +} + +void MCAsmInfoCOFF::printSwitchToSection(const MCSection &Section, uint32_t, + const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionCOFF &>(Section); + // standard sections don't require the '.section' + if (Sec.shouldOmitSectionDirective(Sec.getName())) { + OS << '\t' << Sec.getName() << '\n'; + return; + } + + OS << "\t.section\t" << Sec.getName() << ",\""; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) + OS << 'd'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) + OS << 'b'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE) + OS << 'x'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE) + OS << 'w'; + else if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_READ) + OS << 'r'; + else + OS << 'y'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE) + OS << 'n'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED) + OS << 's'; + if ((Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) && + !Sec.isImplicitlyDiscardable(Sec.getName())) + OS << 'D'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO) + OS << 'i'; + OS << '"'; + + // unique should be tail of .section directive. + if (Sec.isUnique() && !Sec.COMDATSymbol) + OS << ",unique," << Sec.UniqueID; + + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { + if (Sec.COMDATSymbol) + OS << ","; + else + OS << "\n\t.linkonce\t"; + switch (Sec.Selection) { + case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: + OS << "one_only"; + break; + case COFF::IMAGE_COMDAT_SELECT_ANY: + OS << "discard"; + break; + case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE: + OS << "same_size"; + break; + case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH: + OS << "same_contents"; + break; + case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE: + OS << "associative"; + break; + case COFF::IMAGE_COMDAT_SELECT_LARGEST: + OS << "largest"; + break; + case COFF::IMAGE_COMDAT_SELECT_NEWEST: + OS << "newest"; + break; + default: + assert(false && "unsupported COFF selection type"); + break; + } + if (Sec.COMDATSymbol) { + OS << ","; + Sec.COMDATSymbol->print(OS, this); + } + } + + if (Sec.isUnique() && Sec.COMDATSymbol) + OS << ",unique," << Sec.UniqueID; + + OS << '\n'; +} diff --git a/llvm/lib/MC/MCAsmInfoDarwin.cpp b/llvm/lib/MC/MCAsmInfoDarwin.cpp index 9cba775..e156fa0 100644 --- a/llvm/lib/MC/MCAsmInfoDarwin.cpp +++ b/llvm/lib/MC/MCAsmInfoDarwin.cpp @@ -85,3 +85,8 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { DwarfUsesRelocationsAcrossSections = false; SetDirectiveSuppressesReloc = true; } + +bool MCAsmInfoDarwin::useCodeAlign(const MCSection &Sec) const { + return static_cast<const MCSectionMachO &>(Sec).hasAttribute( + MachO::S_ATTR_PURE_INSTRUCTIONS); +} diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp index 7eb89ef..cdae9d7 100644 --- a/llvm/lib/MC/MCAsmInfoELF.cpp +++ b/llvm/lib/MC/MCAsmInfoELF.cpp @@ -12,9 +12,16 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoELF.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" +#include 
"llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TargetParser/Triple.h" +#include <cassert> using namespace llvm; @@ -28,9 +35,198 @@ MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const { return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0); } +bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const { + return static_cast<const MCSectionELF &>(Sec).getFlags() & ELF::SHF_EXECINSTR; +} + MCAsmInfoELF::MCAsmInfoELF() { HasIdentDirective = true; WeakRefDirective = "\t.weak\t"; PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; } + +static void printName(raw_ostream &OS, StringRef Name) { + if (Name.find_first_not_of("0123456789_." + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { + OS << Name; + return; + } + OS << '"'; + for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { + if (*B == '"') // Unquoted " + OS << "\\\""; + else if (*B != '\\') // Neither " or backslash + OS << *B; + else if (B + 1 == E) // Trailing backslash + OS << "\\\\"; + else { + OS << B[0] << B[1]; // Quoted character + ++B; + } + } + OS << '"'; +} + +void MCAsmInfoELF::printSwitchToSection(const MCSection &Section, + uint32_t Subsection, const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionELF &>(Section); + if (!Sec.isUnique() && shouldOmitSectionDirective(Sec.getName())) { + OS << '\t' << Sec.getName(); + if (Subsection) + OS << '\t' << Subsection; + OS << '\n'; + return; + } + + OS << "\t.section\t"; + printName(OS, Sec.getName()); + + // Handle the weird solaris syntax if desired. + if (usesSunStyleELFSectionSwitchSyntax() && !(Sec.Flags & ELF::SHF_MERGE)) { + if (Sec.Flags & ELF::SHF_ALLOC) + OS << ",#alloc"; + if (Sec.Flags & ELF::SHF_EXECINSTR) + OS << ",#execinstr"; + if (Sec.Flags & ELF::SHF_WRITE) + OS << ",#write"; + if (Sec.Flags & ELF::SHF_EXCLUDE) + OS << ",#exclude"; + if (Sec.Flags & ELF::SHF_TLS) + OS << ",#tls"; + OS << '\n'; + return; + } + + OS << ",\""; + if (Sec.Flags & ELF::SHF_ALLOC) + OS << 'a'; + if (Sec.Flags & ELF::SHF_EXCLUDE) + OS << 'e'; + if (Sec.Flags & ELF::SHF_EXECINSTR) + OS << 'x'; + if (Sec.Flags & ELF::SHF_WRITE) + OS << 'w'; + if (Sec.Flags & ELF::SHF_MERGE) + OS << 'M'; + if (Sec.Flags & ELF::SHF_STRINGS) + OS << 'S'; + if (Sec.Flags & ELF::SHF_TLS) + OS << 'T'; + if (Sec.Flags & ELF::SHF_LINK_ORDER) + OS << 'o'; + if (Sec.Flags & ELF::SHF_GROUP) + OS << 'G'; + if (Sec.Flags & ELF::SHF_GNU_RETAIN) + OS << 'R'; + + // If there are os-specific flags, print them. + if (T.isOSSolaris()) + if (Sec.Flags & ELF::SHF_SUNW_NODISCARD) + OS << 'R'; + + // If there are tarSec.get-specific flags, print them. + Triple::ArchType Arch = T.getArch(); + if (Arch == Triple::xcore) { + if (Sec.Flags & ELF::XCORE_SHF_CP_SECTION) + OS << 'c'; + if (Sec.Flags & ELF::XCORE_SHF_DP_SECTION) + OS << 'd'; + } else if (T.isARM() || T.isThumb()) { + if (Sec.Flags & ELF::SHF_ARM_PURECODE) + OS << 'y'; + } else if (T.isAArch64()) { + if (Sec.Flags & ELF::SHF_AARCH64_PURECODE) + OS << 'y'; + } else if (Arch == Triple::hexagon) { + if (Sec.Flags & ELF::SHF_HEX_GPREL) + OS << 's'; + } else if (Arch == Triple::x86_64) { + if (Sec.Flags & ELF::SHF_X86_64_LARGE) + OS << 'l'; + } + + OS << '"'; + + OS << ','; + + // If comment string is '@', e.g. 
as on ARM - use '%' instead + if (getCommentString()[0] == '@') + OS << '%'; + else + OS << '@'; + + if (Sec.Type == ELF::SHT_INIT_ARRAY) + OS << "init_array"; + else if (Sec.Type == ELF::SHT_FINI_ARRAY) + OS << "fini_array"; + else if (Sec.Type == ELF::SHT_PREINIT_ARRAY) + OS << "preinit_array"; + else if (Sec.Type == ELF::SHT_NOBITS) + OS << "nobits"; + else if (Sec.Type == ELF::SHT_NOTE) + OS << "note"; + else if (Sec.Type == ELF::SHT_PROGBITS) + OS << "progbits"; + else if (Sec.Type == ELF::SHT_X86_64_UNWIND) + OS << "unwind"; + else if (Sec.Type == ELF::SHT_MIPS_DWARF) + // Print hex value of the flag while we do not have + // any standard symbolic representation of the flag. + OS << "0x7000001e"; + else if (Sec.Type == ELF::SHT_LLVM_ODRTAB) + OS << "llvm_odrtab"; + else if (Sec.Type == ELF::SHT_LLVM_LINKER_OPTIONS) + OS << "llvm_linker_options"; + else if (Sec.Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE) + OS << "llvm_call_graph_profile"; + else if (Sec.Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES) + OS << "llvm_dependent_libraries"; + else if (Sec.Type == ELF::SHT_LLVM_SYMPART) + OS << "llvm_sympart"; + else if (Sec.Type == ELF::SHT_LLVM_BB_ADDR_MAP) + OS << "llvm_bb_addr_map"; + else if (Sec.Type == ELF::SHT_LLVM_OFFLOADING) + OS << "llvm_offloading"; + else if (Sec.Type == ELF::SHT_LLVM_LTO) + OS << "llvm_lto"; + else if (Sec.Type == ELF::SHT_LLVM_JT_SIZES) + OS << "llvm_jt_sizes"; + else if (Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE) + OS << "llvm_cfi_jump_table"; + else + OS << "0x" << Twine::utohexstr(Sec.Type); + + if (Sec.EntrySize) { + assert((Sec.Flags & ELF::SHF_MERGE) || + Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE); + OS << "," << Sec.EntrySize; + } + + if (Sec.Flags & ELF::SHF_LINK_ORDER) { + OS << ","; + if (Sec.LinkedToSym) + printName(OS, Sec.LinkedToSym->getName()); + else + OS << '0'; + } + + if (Sec.Flags & ELF::SHF_GROUP) { + OS << ","; + printName(OS, Sec.Group.getPointer()->getName()); + if (Sec.isComdat()) + OS << ",comdat"; + } + + if (Sec.isUnique()) + OS << ",unique," << Sec.UniqueID; + + OS << '\n'; + + if (Subsection) { + OS << "\t.subsection\t" << Subsection; + OS << '\n'; + } +} diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp index 3c81a46..0a5d1927 100644 --- a/llvm/lib/MC/MCAsmInfoGOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp @@ -13,11 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoGOFF.h" +#include "llvm/BinaryFormat/GOFF.h" +#include "llvm/MC/MCSectionGOFF.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; -void MCAsmInfoGOFF::anchor() {} - MCAsmInfoGOFF::MCAsmInfoGOFF() { Data64bitsDirective = "\t.quad\t"; HasDotTypeDotSizeDirective = false; @@ -25,3 +26,136 @@ MCAsmInfoGOFF::MCAsmInfoGOFF() { PrivateLabelPrefix = "L#"; ZeroDirective = "\t.space\t"; } + +static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, + GOFF::ESDAlignment Alignment, + GOFF::ESDLoadingBehavior LoadBehavior, + GOFF::ESDExecutable Executable, bool IsReadOnly, + uint32_t SortKey, uint8_t FillByteValue, + StringRef PartName) { + OS << Name << " CATTR "; + OS << "ALIGN(" << static_cast<unsigned>(Alignment) << ")," + << "FILL(" << static_cast<unsigned>(FillByteValue) << ")"; + switch (LoadBehavior) { + case GOFF::ESD_LB_Deferred: + OS << ",DEFLOAD"; + break; + case GOFF::ESD_LB_NoLoad: + OS << ",NOLOAD"; + break; + default: + break; + } + switch (Executable) { + case GOFF::ESD_EXE_CODE: + OS << ",EXECUTABLE"; + break; + case GOFF::ESD_EXE_DATA: + OS << 
",NOTEXECUTABLE"; + break; + default: + break; + } + if (IsReadOnly) + OS << ",READONLY"; + if (Rmode != GOFF::ESD_RMODE_None) { + OS << ','; + OS << "RMODE("; + switch (Rmode) { + case GOFF::ESD_RMODE_24: + OS << "24"; + break; + case GOFF::ESD_RMODE_31: + OS << "31"; + break; + case GOFF::ESD_RMODE_64: + OS << "64"; + break; + case GOFF::ESD_RMODE_None: + break; + } + OS << ')'; + } + if (SortKey) + OS << ",PRIORITY(" << SortKey << ")"; + if (!PartName.empty()) + OS << ",PART(" << PartName << ")"; + OS << '\n'; +} + +static void emitXATTR(raw_ostream &OS, StringRef Name, + GOFF::ESDLinkageType Linkage, + GOFF::ESDExecutable Executable, + GOFF::ESDBindingScope BindingScope) { + OS << Name << " XATTR "; + OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),"; + if (Executable != GOFF::ESD_EXE_Unspecified) + OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? "CODE" : "DATA") + << "),"; + if (BindingScope != GOFF::ESD_BSC_Unspecified) { + OS << "SCOPE("; + switch (BindingScope) { + case GOFF::ESD_BSC_Section: + OS << "SECTION"; + break; + case GOFF::ESD_BSC_Module: + OS << "MODULE"; + break; + case GOFF::ESD_BSC_Library: + OS << "LIBRARY"; + break; + case GOFF::ESD_BSC_ImportExport: + OS << "EXPORT"; + break; + default: + break; + } + OS << ')'; + } + OS << '\n'; +} + +void MCAsmInfoGOFF::printSwitchToSection(const MCSection &Section, + uint32_t Subsection, const Triple &T, + raw_ostream &OS) const { + auto &Sec = + const_cast<MCSectionGOFF &>(static_cast<const MCSectionGOFF &>(Section)); + switch (Sec.SymbolType) { + case GOFF::ESD_ST_SectionDefinition: { + OS << Sec.getName() << " CSECT\n"; + Sec.Emitted = true; + break; + } + case GOFF::ESD_ST_ElementDefinition: { + printSwitchToSection(*Sec.getParent(), Subsection, T, OS); + if (!Sec.Emitted) { + emitCATTR(OS, Sec.getName(), Sec.EDAttributes.Rmode, + Sec.EDAttributes.Alignment, Sec.EDAttributes.LoadBehavior, + GOFF::ESD_EXE_Unspecified, Sec.EDAttributes.IsReadOnly, 0, + Sec.EDAttributes.FillByteValue, StringRef()); + Sec.Emitted = true; + } else + OS << Sec.getName() << " CATTR\n"; + break; + } + case GOFF::ESD_ST_PartReference: { + MCSectionGOFF *ED = Sec.getParent(); + printSwitchToSection(*ED->getParent(), Subsection, T, OS); + if (!Sec.Emitted) { + emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode, + ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior, + Sec.PRAttributes.Executable, ED->EDAttributes.IsReadOnly, + Sec.PRAttributes.SortKey, ED->EDAttributes.FillByteValue, + Sec.getName()); + emitXATTR(OS, Sec.getName(), Sec.PRAttributes.Linkage, + Sec.PRAttributes.Executable, Sec.PRAttributes.BindingScope); + ED->Emitted = true; + Sec.Emitted = true; + } else + OS << ED->getName() << " CATTR PART(" << Sec.getName() << ")\n"; + break; + } + default: + llvm_unreachable("Wrong section type"); + } +} diff --git a/llvm/lib/MC/MCAsmInfoWasm.cpp b/llvm/lib/MC/MCAsmInfoWasm.cpp index ce6ec7e..5e44f48 100644 --- a/llvm/lib/MC/MCAsmInfoWasm.cpp +++ b/llvm/lib/MC/MCAsmInfoWasm.cpp @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoWasm.h" -using namespace llvm; +#include "llvm/MC/MCSectionWasm.h" +#include "llvm/MC/MCSymbolWasm.h" +#include "llvm/Support/raw_ostream.h" -void MCAsmInfoWasm::anchor() {} +using namespace llvm; MCAsmInfoWasm::MCAsmInfoWasm() { HasIdentDirective = true; @@ -23,3 +25,80 @@ MCAsmInfoWasm::MCAsmInfoWasm() { PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; } + +static void printName(raw_ostream &OS, 
StringRef Name) { + if (Name.find_first_not_of("0123456789_." + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { + OS << Name; + return; + } + OS << '"'; + for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { + if (*B == '"') // Unquoted " + OS << "\\\""; + else if (*B != '\\') // Neither " or backslash + OS << *B; + else if (B + 1 == E) // Trailing backslash + OS << "\\\\"; + else { + OS << B[0] << B[1]; // Quoted character + ++B; + } + } + OS << '"'; +} + +void MCAsmInfoWasm::printSwitchToSection(const MCSection &Section, + uint32_t Subsection, const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionWasm &>(Section); + if (shouldOmitSectionDirective(Sec.getName())) { + OS << '\t' << Sec.getName(); + if (Subsection) + OS << '\t' << Subsection; + OS << '\n'; + return; + } + + OS << "\t.section\t"; + printName(OS, Sec.getName()); + OS << ",\""; + + if (Sec.IsPassive) + OS << 'p'; + if (Sec.Group) + OS << 'G'; + if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS) + OS << 'S'; + if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_TLS) + OS << 'T'; + if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN) + OS << 'R'; + + OS << '"'; + + OS << ','; + + // If comment string is '@', e.g. as on ARM - use '%' instead + if (getCommentString()[0] == '@') + OS << '%'; + else + OS << '@'; + + // TODO: Print section type. + + if (Sec.Group) { + OS << ","; + printName(OS, Sec.Group->getName()); + OS << ",comdat"; + } + + if (Sec.isUnique()) + OS << ",unique," << Sec.UniqueID; + + OS << '\n'; + + if (Subsection) + OS << "\t.subsection\t" << Subsection << '\n'; +} diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp index 6ef11ba..0403b44 100644 --- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp @@ -8,7 +8,11 @@ #include "llvm/MC/MCAsmInfoXCOFF.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -16,8 +20,6 @@ namespace llvm { extern cl::opt<cl::boolOrDefault> UseLEB128Directives; } -void MCAsmInfoXCOFF::anchor() {} - MCAsmInfoXCOFF::MCAsmInfoXCOFF() { IsAIX = true; IsLittleEndian = false; @@ -56,3 +58,121 @@ bool MCAsmInfoXCOFF::isAcceptableChar(char C) const { // any combination of these. 
return isAlnum(C) || C == '_' || C == '.'; } + +bool MCAsmInfoXCOFF::useCodeAlign(const MCSection &Sec) const { + return static_cast<const MCSectionXCOFF &>(Sec).getKind().isText(); +} + +MCSectionXCOFF::~MCSectionXCOFF() = default; + +void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const { + OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n'; +} + +void MCAsmInfoXCOFF::printSwitchToSection(const MCSection &Section, uint32_t, + const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionXCOFF &>(Section); + if (Sec.getKind().isText()) { + if (Sec.getMappingClass() != XCOFF::XMC_PR) + report_fatal_error("Unhandled storage-mapping class for .text csect"); + + Sec.printCsectDirective(OS); + return; + } + + if (Sec.getKind().isReadOnly()) { + if (Sec.getMappingClass() != XCOFF::XMC_RO && + Sec.getMappingClass() != XCOFF::XMC_TD) + report_fatal_error("Unhandled storage-mapping class for .rodata csect."); + Sec.printCsectDirective(OS); + return; + } + + if (Sec.getKind().isReadOnlyWithRel()) { + if (Sec.getMappingClass() != XCOFF::XMC_RW && + Sec.getMappingClass() != XCOFF::XMC_RO && + Sec.getMappingClass() != XCOFF::XMC_TD) + report_fatal_error( + "Unexepected storage-mapping class for ReadOnlyWithRel kind"); + Sec.printCsectDirective(OS); + return; + } + + // Initialized TLS data. + if (Sec.getKind().isThreadData()) { + // We only expect XMC_TL here for initialized TLS data. + if (Sec.getMappingClass() != XCOFF::XMC_TL) + report_fatal_error("Unhandled storage-mapping class for .tdata csect."); + Sec.printCsectDirective(OS); + return; + } + + if (Sec.getKind().isData()) { + switch (Sec.getMappingClass()) { + case XCOFF::XMC_RW: + case XCOFF::XMC_DS: + case XCOFF::XMC_TD: + Sec.printCsectDirective(OS); + break; + case XCOFF::XMC_TC: + case XCOFF::XMC_TE: + break; + case XCOFF::XMC_TC0: + OS << "\t.toc\n"; + break; + default: + report_fatal_error("Unhandled storage-mapping class for .data csect."); + } + return; + } + + if (Sec.isCsect() && Sec.getMappingClass() == XCOFF::XMC_TD) { + // Common csect type (uninitialized storage) does not have to print + // csect directive for section switching unless it is local. + if (Sec.getKind().isCommon() && !Sec.getKind().isBSSLocal()) + return; + + assert(Sec.getKind().isBSS() && "Unexpected section kind for toc-data"); + Sec.printCsectDirective(OS); + return; + } + // Common csect type (uninitialized storage) does not have to print csect + // directive for section switching. + if (Sec.isCsect() && Sec.getCSectType() == XCOFF::XTY_CM) { + assert((Sec.getMappingClass() == XCOFF::XMC_RW || + Sec.getMappingClass() == XCOFF::XMC_BS || + Sec.getMappingClass() == XCOFF::XMC_UL) && + "Generated a storage-mapping class for a common/bss/tbss csect we " + "don't " + "understand how to switch to."); + // Common symbols and local zero-initialized symbols for TLS and Non-TLS are + // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to + // cover TLS common and zero-initialized local symbols since linkage type + // (in the GlobalVariable) is not accessible in this class. + assert((Sec.getKind().isBSSLocal() || Sec.getKind().isCommon() || + Sec.getKind().isThreadBSS()) && + "wrong symbol type for .bss/.tbss csect"); + // Don't have to print a directive for switching to section for commons + // and zero-initialized TLS data. The '.comm' and '.lcomm' directives of the + // variable will create the needed csect. 
+ return; + } + + // Zero-initialized TLS data with weak or external linkage are not eligible to + // be put into common csect. + if (Sec.getKind().isThreadBSS()) { + Sec.printCsectDirective(OS); + return; + } + + // XCOFF debug sections. + if (Sec.getKind().isMetadata() && Sec.isDwarfSect()) { + OS << "\n\t.dwsect " << format("0x%" PRIx32, *Sec.getDwarfSubtypeFlags()) + << '\n'; + OS << Sec.getName() << ':' << '\n'; + return; + } + + report_fatal_error("Printing for this SectionKind is unimplemented."); +} diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 67c53e0..da51da4 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -345,7 +345,7 @@ public: void emitIdent(StringRef IdentString) override; void emitCFIBKeyFrame() override; void emitCFIMTETaggedFrame() override; - void emitCFISections(bool EH, bool Debug) override; + void emitCFISections(bool EH, bool Debug, bool SFrame) override; void emitCFIDefCfa(int64_t Register, int64_t Offset, SMLoc Loc) override; void emitCFIDefCfaOffset(int64_t Offset, SMLoc Loc) override; void emitCFIDefCfaRegister(int64_t Register, SMLoc Loc) override; @@ -532,8 +532,8 @@ void MCAsmStreamer::switchSection(MCSection *Section, uint32_t Subsection) { if (MCTargetStreamer *TS = getTargetStreamer()) { TS->changeSection(Cur.first, Section, Subsection, OS); } else { - Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, - Subsection); + MAI->printSwitchToSection(*Section, Subsection, + getContext().getTargetTriple(), OS); } } MCStreamer::switchSection(Section, Subsection); @@ -543,7 +543,7 @@ bool MCAsmStreamer::popSection() { if (!MCStreamer::popSection()) return false; auto [Sec, Subsec] = getCurrentSection(); - Sec->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, Subsec); + MAI->printSwitchToSection(*Sec, Subsec, getContext().getTargetTriple(), OS); return true; } @@ -1105,7 +1105,7 @@ void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; - assert(Section->getVariant() == MCSection::SV_MachO && + assert(getContext().getObjectFileType() == MCContext::IsMachO && ".zerofill is a Mach-O specific directive"); // This is a mach-o specific directive. @@ -1130,7 +1130,7 @@ void MCAsmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, // Instead of using the Section we'll just use the shortcut. - assert(Section->getVariant() == MCSection::SV_MachO && + assert(getContext().getObjectFileType() == MCContext::IsMachO && ".zerofill is a Mach-O specific directive"); // This is a mach-o specific directive and section. 
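With this change the assembly streamer asks MCAsmInfo, rather than the MCSection subclass, how to print a section switch and whether a section should be padded with code alignment. A minimal sketch of the override pattern, using a hypothetical MCAsmInfoFoo (the class name and the simplified directive output are assumptions; the virtual signatures mirror useCodeAlign and printSwitchToSection as added in this patch):

#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/Triple.h"

namespace {
// Hypothetical object-format AsmInfo: section-switch printing and the
// code-alignment policy now live on MCAsmInfo instead of MCSection.
class MCAsmInfoFoo : public llvm::MCAsmInfo {
public:
  // Pad text sections with code alignment (nops); everything else with zeros.
  bool useCodeAlign(const llvm::MCSection &Sec) const override {
    return Sec.isText();
  }
  // MCAsmStreamer::switchSection/popSection dispatch here instead of calling
  // MCSection::printSwitchToSection.
  void printSwitchToSection(const llvm::MCSection &Sec, uint32_t Subsection,
                            const llvm::Triple &, llvm::raw_ostream &OS) const override {
    OS << "\t.section\t" << Sec.getName() << '\n';
    if (Subsection)
      OS << "\t.subsection\t" << Subsection << '\n';
  }
};
} // namespace

Keeping the asm-syntax knowledge in the MCAsmInfo hierarchy is what allows the per-format printers in MCSectionCOFF.cpp, MCSectionELF.cpp, and friends to be deleted further down in this patch.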
@@ -1906,15 +1906,24 @@ void MCAsmStreamer::emitIdent(StringRef IdentString) { EmitEOL(); } -void MCAsmStreamer::emitCFISections(bool EH, bool Debug) { - MCStreamer::emitCFISections(EH, Debug); +void MCAsmStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) { + MCStreamer::emitCFISections(EH, Debug, SFrame); OS << "\t.cfi_sections "; + bool C = false; if (EH) { OS << ".eh_frame"; - if (Debug) - OS << ", .debug_frame"; - } else if (Debug) { + C = true; + } + if (Debug) { + if (C) + OS << ", "; OS << ".debug_frame"; + C = true; + } + if (SFrame) { + if (C) + OS << ", "; + OS << ".sframe"; } EmitEOL(); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index e142ac1..8500fd1 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -106,7 +106,6 @@ void MCAssembler::reset() { bool MCAssembler::registerSection(MCSection &Section) { if (Section.isRegistered()) return false; - assert(Section.curFragList()->Head && "allocInitialFragment not called"); Sections.push_back(&Section); Section.setIsRegistered(true); return true; } diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 12b3fba..39bf628 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -200,16 +200,6 @@ MCInst *MCContext::createMCInst() { return new (MCInstAllocator.Allocate()) MCInst; } -// Allocate the initial MCFragment for the begin symbol. -MCFragment *MCContext::allocInitialFragment(MCSection &Sec) { - assert(!Sec.curFragList()->Head); - auto *F = allocFragment<MCFragment>(); - F->setParent(&Sec); - Sec.curFragList()->Head = F; - Sec.curFragList()->Tail = F; - return F; -} - //===----------------------------------------------------------------------===// // Symbol Manipulation //===----------------------------------------------------------------------===// @@ -443,17 +433,19 @@ MCSymbol *MCContext::getDirectionalLocalSymbol(unsigned LocalLabelVal, return getOrCreateDirectionalLocalSymbol(LocalLabelVal, Instance); } +// Create a section symbol, with a distinct one for each section of the same +// name. The first symbol is used for assembly code references. template <typename Symbol> Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) { Symbol *R; auto &SymEntry = getSymbolTableEntry(Section); MCSymbol *Sym = SymEntry.second.Symbol; - // A section symbol can not redefine regular symbols. There may be multiple - // sections with the same name, in which case the first such section wins. if (Sym && Sym->isDefined() && (!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym)) reportError(SMLoc(), "invalid symbol redefinition"); - if (Sym && Sym->isUndefined()) { + // Use the symbol's index to track if it has been used as a section symbol. + // Set to -1 to catch potential bugs if misused as a symbol index. + if (Sym && Sym->getIndex() != -1u) { R = cast<Symbol>(Sym); } else { SymEntry.second.Used = true; @@ -461,6 +453,8 @@ Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) { if (!Sym) SymEntry.second.Symbol = R; } + // Mark as section symbol.
+ R->setIndex(-1u); return R; } @@ -568,7 +562,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section, MCSectionMachO(Segment, Name.substr(Name.size() - Section.size()), TypeAndAttributes, Reserved2, Kind, Begin); R.first->second = Ret; - allocInitialFragment(*Ret); return Ret; } @@ -579,15 +572,8 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type, bool Comdat, unsigned UniqueID, const MCSymbolELF *LinkedToSym) { auto *R = getOrCreateSectionSymbol<MCSymbolELF>(Section); - R->setBinding(ELF::STB_LOCAL); - R->setType(ELF::STT_SECTION); - - auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF( + return new (ELFAllocator.Allocate()) MCSectionELF( Section, Type, Flags, EntrySize, Group, Comdat, UniqueID, R, LinkedToSym); - - auto *F = allocInitialFragment(*Ret); - R->setFragment(F); - return Ret; } MCSectionELF * @@ -743,7 +729,6 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name, MCSectionGOFF(CachedName, Kind, IsVirtual, Attributes, static_cast<MCSectionGOFF *>(Parent)); Iter->second = GOFFSection; - allocInitialFragment(*GOFFSection); return GOFFSection; } @@ -782,8 +767,8 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE && COMDATSymbol->isDefined() && (!COMDATSymbol->isInSection() || - cast<MCSectionCOFF>(COMDATSymbol->getSection()).getCOMDATSymbol() != - COMDATSymbol)) + static_cast<const MCSectionCOFF &>(COMDATSymbol->getSection()) + .getCOMDATSymbol() != COMDATSymbol)) reportError(SMLoc(), "invalid symbol redefinition"); } @@ -798,8 +783,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, MCSectionCOFF *Result = new (COFFAllocator.Allocate()) MCSectionCOFF( CachedName, Characteristics, COMDATSymbol, Selection, UniqueID, Begin); Iter->second = Result; - auto *F = allocInitialFragment(*Result); - Begin->setFragment(F); + Begin->setFragment(&Result->getDummyFragment()); return Result; } @@ -870,8 +854,6 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind, MCSectionWasm(CachedName, Kind, Flags, GroupSym, UniqueID, Begin); Entry.second = Result; - auto *F = allocInitialFragment(*Result); - Begin->setFragment(F); return Result; } @@ -927,24 +909,11 @@ MCSectionXCOFF *MCContext::getXCOFFSection( MultiSymbolsAllowed); Entry.second = Result; - - auto *F = allocInitialFragment(*Result); - - // We might miss calculating the symbols difference as absolute value before - // adding fixups when symbol_A without the fragment set is the csect itself - // and symbol_B is in it. - // TODO: Currently we only set the fragment for XMC_PR csects and DWARF - // sections because we don't have other cases that hit this problem yet. 
- if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR) - QualName->setFragment(F); - return Result; } MCSectionSPIRV *MCContext::getSPIRVSection() { MCSectionSPIRV *Result = new (SPIRVAllocator.Allocate()) MCSectionSPIRV(); - - allocInitialFragment(*Result); return Result; } @@ -964,7 +933,6 @@ MCSectionDXContainer *MCContext::getDXContainerSection(StringRef Section, new (DXCAllocator.Allocate()) MCSectionDXContainer(Name, K, nullptr); // The first fragment will store the header - allocInitialFragment(*MapIt->second); return MapIt->second; } diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index b8cbaea5..38744a0 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -89,7 +89,9 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { getWriter().markGnuAbi(); MCObjectStreamer::changeSection(Section, Subsection); - Asm.registerSymbol(*Section->getBeginSymbol()); + auto *Sym = static_cast<MCSymbolELF *>(Section->getBeginSymbol()); + Sym->setBinding(ELF::STB_LOCAL); + Sym->setType(ELF::STT_SECTION); } void MCELFStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) { diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index dbb2fd1..c24c82d 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -346,17 +346,16 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, Displacement *= -1; } - // Track whether B is before a relaxable instruction and whether A is after - // a relaxable instruction. If SA and SB are separated by a linker-relaxable - // instruction, the difference cannot be resolved as it may be changed by - // the linker. + // Track whether B is before a relaxable instruction/alignment and whether A + // is after a relaxable instruction/alignment. If SA and SB are separated by + // a linker-relaxable instruction/alignment, the difference cannot be + // resolved as it may be changed by the linker. bool BBeforeRelax = false, AAfterRelax = false; for (auto F = FB; F; F = F->getNext()) { - auto DF = F->getKind() == MCFragment::FT_Data ? F : nullptr; - if (DF && DF->isLinkerRelaxable()) { - if (&*F != FB || SBOffset != DF->getContents().size()) + if (F && F->isLinkerRelaxable()) { + if (&*F != FB || SBOffset != F->getSize()) BBeforeRelax = true; - if (&*F != FA || SAOffset == DF->getContents().size()) + if (&*F != FA || SAOffset == F->getSize()) AAfterRelax = true; if (BBeforeRelax && AAfterRelax) return; @@ -370,17 +369,15 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, } int64_t Num; - if (DF) { - Displacement += DF->getContents().size(); - } else if (F->getKind() == MCFragment::FT_Relaxable && + if (F->getKind() == MCFragment::FT_Data) { + Displacement += F->getFixedSize(); + } else if ((F->getKind() == MCFragment::FT_Relaxable || + F->getKind() == MCFragment::FT_Align) && Asm->hasFinalLayout()) { // Before finishLayout, a relaxable fragment's size is indeterminate. // After layout, during relocation generation, it can be treated as a // data fragment. 
Displacement += F->getSize(); - } else if (F->getKind() == MCFragment::FT_Align && Layout && - F->isLinkerRelaxable()) { - Displacement += Asm->computeFragmentSize(*F); } else if (auto *FF = dyn_cast<MCFillFragment>(F); FF && FF->getNumValues().evaluateAsAbsolute(Num)) { Displacement += Num * FF->getValueSize(); diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 3c395e5..6cbdf74 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -35,7 +35,7 @@ MCFragment::MCFragment(FragmentType Kind, bool HasInstructions) } const MCSymbol *MCFragment::getAtom() const { - return cast<MCSectionMachO>(Parent)->getAtom(LayoutOrder); + return static_cast<const MCSectionMachO *>(Parent)->getAtom(LayoutOrder); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index b702191..1718e2a 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -26,19 +26,15 @@ GOFFObjectWriter &MCGOFFStreamer::getWriter() { return static_cast<GOFFObjectWriter &>(getAssembler().getWriter()); } -// Make sure that all section are registered in the correct order. -static void registerSectionHierarchy(MCAssembler &Asm, MCSectionGOFF *Section) { - if (Section->isRegistered()) - return; - if (Section->getParent()) - registerSectionHierarchy(Asm, Section->getParent()); - Asm.registerSection(*Section); -} - void MCGOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - registerSectionHierarchy(getAssembler(), - static_cast<MCSectionGOFF *>(Section)); - MCObjectStreamer::changeSection(Section, Subsection); + // Make sure that all section are registered in the correct order. + SmallVector<MCSectionGOFF *> Sections; + for (auto *S = static_cast<MCSectionGOFF *>(Section); S; S = S->getParent()) + Sections.push_back(S); + while (!Sections.empty()) { + auto *S = Sections.pop_back_val(); + MCObjectStreamer::changeSection(S, Sections.empty() ? Subsection : 0); + } } MCStreamer *llvm::createGOFFStreamer(MCContext &Context, diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 8c3332c..a214513 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -140,6 +140,8 @@ void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) { MCSymbol *Label = getContext().createLinkerPrivateTempSymbol(); Section->setBeginSymbol(Label); HasSectionLabel[Section] = true; + if (!Label->isInSection()) + emitLabel(Label); } } @@ -441,13 +443,13 @@ void MCMachOStreamer::finishImpl() { // Set the fragment atom associations by tracking the last seen atom defining // symbol. for (MCSection &Sec : getAssembler()) { - cast<MCSectionMachO>(Sec).allocAtoms(); + static_cast<MCSectionMachO &>(Sec).allocAtoms(); const MCSymbol *CurrentAtom = nullptr; size_t I = 0; for (MCFragment &Frag : Sec) { if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag)) CurrentAtom = Symbol; - cast<MCSectionMachO>(Sec).setAtom(I++, CurrentAtom); + static_cast<MCSectionMachO &>(Sec).setAtom(I++, CurrentAtom); } } @@ -482,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() { // For each entry, reserve space for 2 32-bit indices and a 64-bit count. 
size_t SectionBytes = W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t)); - (*CGProfileSection->begin()).appendContents(SectionBytes, 0); + (*CGProfileSection->begin()) + .setVarContents(std::vector<char>(SectionBytes, 0)); } MCStreamer *llvm::createMachOStreamer(MCContext &Context, @@ -518,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() { // (instead of emitting a zero-sized section) so these relocations are // technically valid, even though we don't expect these relocations to // actually be applied by the linker. - Frag->appendContents(8, 0); + constexpr char zero[8] = {}; + Frag->setVarContents(zero); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 89f4da5..e277143 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -57,6 +57,10 @@ void MCObjectStreamer::insert(MCFragment *F) { newFragment(); } +void MCObjectStreamer::appendContents(ArrayRef<char> Contents) { + CurFrag->appendContents(Contents); +} + void MCObjectStreamer::appendContents(size_t Num, char Elt) { CurFrag->appendContents(Num, Elt); } @@ -129,10 +133,11 @@ void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) { Assembler->registerSymbol(Sym); } -void MCObjectStreamer::emitCFISections(bool EH, bool Debug) { - MCStreamer::emitCFISections(EH, Debug); +void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) { + MCStreamer::emitCFISections(EH, Debug, SFrame); EmitEHFrame = EH; EmitDebugFrame = Debug; + EmitSFrame = SFrame; } void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, @@ -184,10 +189,10 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { getAssembler().registerSymbol(*Symbol); - // If there is a current fragment, mark the symbol as pointing into it. - // Otherwise queue the label and set its fragment pointer when we emit the - // next fragment. - MCFragment *F = getCurrentFragment(); + // Set the fragment and offset. This function might be called by + // changeSection, when the section stack top hasn't been changed to the new + // section. + MCFragment *F = CurFrag; Symbol->setFragment(F); Symbol->setOffset(F->getContents().size()); @@ -246,6 +251,15 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { assert(Section && "Cannot switch to a null section!"); getContext().clearDwarfLocSeen(); + // Register the section and create an initial fragment for subsection 0 + // if `Subsection` is non-zero. + bool NewSec = getAssembler().registerSection(*Section); + MCFragment *F0 = nullptr; + if (NewSec && Subsection) { + changeSection(Section, 0); + F0 = CurFrag; + } + auto &Subsections = Section->Subsections; size_t I = 0, E = Subsections.size(); while (I != E && Subsections[I].first < Subsection) @@ -261,12 +275,13 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { Section->CurFragList = &Subsections[I].second; CurFrag = Section->CurFragList->Tail; - getAssembler().registerSection(*Section); -} - -void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) { - MCStreamer::switchSectionNoPrint(Section); - changeSection(Section, 0); + // Define the section symbol at subsection 0's initial fragment if required. + if (!NewSec) + return; + if (auto *Sym = Section->getBeginSymbol()) { + Sym->setFragment(Subsection ? 
F0 : CurFrag); + getAssembler().registerSymbol(*Sym); + } } void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { @@ -329,31 +344,33 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, MCFragment *F = getCurrentFragment(); // Append the instruction to the data fragment. - size_t FixupStartIndex = F->getFixups().size(); size_t CodeOffset = F->getContents().size(); SmallVector<MCFixup, 1> Fixups; getAssembler().getEmitter().encodeInstruction( Inst, F->getContentsForAppending(), Fixups, STI); F->doneAppending(); - if (!Fixups.empty()) - F->appendFixups(Fixups); F->setHasInstructions(STI); + if (Fixups.empty()) + return; bool MarkedLinkerRelaxable = false; - for (auto &Fixup : MutableArrayRef(F->getFixups()).slice(FixupStartIndex)) { + for (auto &Fixup : Fixups) { Fixup.setOffset(Fixup.getOffset() + CodeOffset); - if (!Fixup.isLinkerRelaxable()) + if (!Fixup.isLinkerRelaxable() || MarkedLinkerRelaxable) continue; - F->setLinkerRelaxable(); + MarkedLinkerRelaxable = true; + // Set the fragment's order within the subsection for use by + // MCAssembler::relaxAlign. + auto *Sec = F->getParent(); + if (!Sec->isLinkerRelaxable()) + Sec->setLinkerRelaxable(); // Do not add data after a linker-relaxable instruction. The difference // between a new label and a label at or before the linker-relaxable // instruction cannot be resolved at assemble-time. - if (!MarkedLinkerRelaxable) { - MarkedLinkerRelaxable = true; - getCurrentSectionOnly()->setLinkerRelaxable(); - newFragment(); - } + F->setLinkerRelaxable(); + newFragment(); } + F->appendFixups(Fixups); } void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, @@ -525,8 +542,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { void MCObjectStreamer::emitBytes(StringRef Data) { MCDwarfLineEntry::make(this, getCurrentSectionOnly()); - MCFragment *DF = getCurrentFragment(); - DF->appendContents(ArrayRef(Data.data(), Data.size())); + appendContents(ArrayRef(Data.data(), Data.size())); } void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill, diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index d0b6ea4..9f64a98 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3413,7 +3413,7 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) { // Check whether we should use optimal code alignment for this .align // directive. 
- if (Section->useCodeAlign() && !HasFillExpr) { + if (MAI.useCodeAlign(*Section) && !HasFillExpr) { getStreamer().emitCodeAlignment( Align(Alignment), &getTargetParser().getSTI(), MaxBytesToFill); } else { @@ -4093,27 +4093,30 @@ bool AsmParser::parseDirectiveCVFPOData() { } /// parseDirectiveCFISections -/// ::= .cfi_sections section [, section] +/// ::= .cfi_sections section [, section][, section] bool AsmParser::parseDirectiveCFISections() { StringRef Name; bool EH = false; bool Debug = false; + bool SFrame = false; if (!parseOptionalToken(AsmToken::EndOfStatement)) { for (;;) { if (parseIdentifier(Name)) - return TokError("expected .eh_frame or .debug_frame"); + return TokError("expected .eh_frame, .debug_frame, or .sframe"); if (Name == ".eh_frame") EH = true; else if (Name == ".debug_frame") Debug = true; + else if (Name == ".sframe") + SFrame = true; if (parseOptionalToken(AsmToken::EndOfStatement)) break; if (parseComma()) return true; } } - getStreamer().emitCFISections(EH, Debug); + getStreamer().emitCFISections(EH, Debug, SFrame); return false; } diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index c7c3df3..2e251cc 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -644,8 +644,8 @@ EndStmt: } if (UseLastGroup) { - if (const MCSectionELF *Section = - cast_or_null<MCSectionELF>(getStreamer().getCurrentSectionOnly())) + if (auto *Section = static_cast<const MCSectionELF *>( + getStreamer().getCurrentSectionOnly())) if (const MCSymbol *Group = Section->getGroup()) { GroupName = Group->getName(); IsComdat = Section->isComdat(); diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index f4684e6..780289e 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -4228,8 +4228,7 @@ bool MasmParser::emitAlignTo(int64_t Alignment) { // Check whether we should use optimal code alignment for this align // directive. const MCSection *Section = getStreamer().getCurrentSectionOnly(); - assert(Section && "must have section to emit alignment"); - if (Section->useCodeAlign()) { + if (MAI.useCodeAlign(*Section)) { getStreamer().emitCodeAlignment(Align(Alignment), &getTargetParser().getSTI(), /*MaxBytesToEmit=*/0); diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 1f824b8..d97f4f5 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -252,7 +252,7 @@ public: if (TypeName == "function") { WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); auto *Current = - cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly()); + static_cast<MCSectionWasm *>(getStreamer().getCurrentSectionOnly()); if (Current->getGroup()) WasmSym->setComdat(true); } else if (TypeName == "global") diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 023f7f2..4f28267 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -18,12 +18,10 @@ using namespace llvm; -MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss, - MCSymbol *Begin) +MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin) : Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText), - IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) { - // The initial subsection number is 0. Create a fragment list. 
- CurFragList = &Subsections.emplace_back(0u, FragList{}).second; + IsBss(IsBss), LinkerRelaxable(false), Name(Name) { + DummyFragment.setParent(this); } MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) { diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp deleted file mode 100644 index 5bf1473..0000000 --- a/llvm/lib/MC/MCSectionCOFF.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//===- lib/MC/MCSectionCOFF.cpp - COFF Code Section Representation --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionCOFF.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/raw_ostream.h" -#include <cassert> - -using namespace llvm; - -// shouldOmitSectionDirective - Decides whether a '.section' directive -// should be printed before the section name -bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name, - const MCAsmInfo &MAI) const { - if (COMDATSymbol || isUnique()) - return false; - - // FIXME: Does .section .bss/.data/.text work everywhere?? - if (Name == ".text" || Name == ".data" || Name == ".bss") - return true; - - return false; -} - -void MCSectionCOFF::setSelection(int Selection) const { - assert(Selection != 0 && "invalid COMDAT selection type"); - this->Selection = Selection; - Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; -} - -void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - // standard sections don't require the '.section' - if (shouldOmitSectionDirective(getName(), MAI)) { - OS << '\t' << getName() << '\n'; - return; - } - - OS << "\t.section\t" << getName() << ",\""; - if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) - OS << 'd'; - if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) - OS << 'b'; - if (getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE) - OS << 'x'; - if (getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE) - OS << 'w'; - else if (getCharacteristics() & COFF::IMAGE_SCN_MEM_READ) - OS << 'r'; - else - OS << 'y'; - if (getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE) - OS << 'n'; - if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED) - OS << 's'; - if ((getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) && - !isImplicitlyDiscardable(getName())) - OS << 'D'; - if (getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO) - OS << 'i'; - OS << '"'; - - // unique should be tail of .section directive. 
- if (isUnique() && !COMDATSymbol) - OS << ",unique," << UniqueID; - - if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { - if (COMDATSymbol) - OS << ","; - else - OS << "\n\t.linkonce\t"; - switch (Selection) { - case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: - OS << "one_only"; - break; - case COFF::IMAGE_COMDAT_SELECT_ANY: - OS << "discard"; - break; - case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE: - OS << "same_size"; - break; - case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH: - OS << "same_contents"; - break; - case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE: - OS << "associative"; - break; - case COFF::IMAGE_COMDAT_SELECT_LARGEST: - OS << "largest"; - break; - case COFF::IMAGE_COMDAT_SELECT_NEWEST: - OS << "newest"; - break; - default: - assert(false && "unsupported COFF selection type"); - break; - } - if (COMDATSymbol) { - OS << ","; - COMDATSymbol->print(OS, &MAI); - } - } - - if (isUnique() && COMDATSymbol) - OS << ",unique," << UniqueID; - - OS << '\n'; -} - -bool MCSectionCOFF::useCodeAlign() const { return isText(); } diff --git a/llvm/lib/MC/MCSectionDXContainer.cpp b/llvm/lib/MC/MCSectionDXContainer.cpp deleted file mode 100644 index 7eee59d..0000000 --- a/llvm/lib/MC/MCSectionDXContainer.cpp +++ /dev/null @@ -1,15 +0,0 @@ -//===- lib/MC/MCSectionDXContainer.cpp - DXContainer Section --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionDXContainer.h" - -using namespace llvm; - -void MCSectionDXContainer::printSwitchToSection(const MCAsmInfo &, - const Triple &, raw_ostream &, - uint32_t) const {} diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp deleted file mode 100644 index ef33f9c..0000000 --- a/llvm/lib/MC/MCSectionELF.cpp +++ /dev/null @@ -1,217 +0,0 @@ -//===- lib/MC/MCSectionELF.cpp - ELF Code Section Representation ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionELF.h" -#include "llvm/ADT/Twine.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/TargetParser/Triple.h" -#include <cassert> - -using namespace llvm; - -// Decides whether a '.section' directive -// should be printed before the section name. -bool MCSectionELF::shouldOmitSectionDirective(StringRef Name, - const MCAsmInfo &MAI) const { - if (isUnique()) - return false; - - return MAI.shouldOmitSectionDirective(Name); -} - -static void printName(raw_ostream &OS, StringRef Name) { - if (Name.find_first_not_of("0123456789_." 
- "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { - OS << Name; - return; - } - OS << '"'; - for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { - if (*B == '"') // Unquoted " - OS << "\\\""; - else if (*B != '\\') // Neither " or backslash - OS << *B; - else if (B + 1 == E) // Trailing backslash - OS << "\\\\"; - else { - OS << B[0] << B[1]; // Quoted character - ++B; - } - } - OS << '"'; -} - -void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - if (shouldOmitSectionDirective(getName(), MAI)) { - OS << '\t' << getName(); - if (Subsection) - OS << '\t' << Subsection; - OS << '\n'; - return; - } - - OS << "\t.section\t"; - printName(OS, getName()); - - // Handle the weird solaris syntax if desired. - if (MAI.usesSunStyleELFSectionSwitchSyntax() && - !(Flags & ELF::SHF_MERGE)) { - if (Flags & ELF::SHF_ALLOC) - OS << ",#alloc"; - if (Flags & ELF::SHF_EXECINSTR) - OS << ",#execinstr"; - if (Flags & ELF::SHF_WRITE) - OS << ",#write"; - if (Flags & ELF::SHF_EXCLUDE) - OS << ",#exclude"; - if (Flags & ELF::SHF_TLS) - OS << ",#tls"; - OS << '\n'; - return; - } - - OS << ",\""; - if (Flags & ELF::SHF_ALLOC) - OS << 'a'; - if (Flags & ELF::SHF_EXCLUDE) - OS << 'e'; - if (Flags & ELF::SHF_EXECINSTR) - OS << 'x'; - if (Flags & ELF::SHF_WRITE) - OS << 'w'; - if (Flags & ELF::SHF_MERGE) - OS << 'M'; - if (Flags & ELF::SHF_STRINGS) - OS << 'S'; - if (Flags & ELF::SHF_TLS) - OS << 'T'; - if (Flags & ELF::SHF_LINK_ORDER) - OS << 'o'; - if (Flags & ELF::SHF_GROUP) - OS << 'G'; - if (Flags & ELF::SHF_GNU_RETAIN) - OS << 'R'; - - // If there are os-specific flags, print them. - if (T.isOSSolaris()) - if (Flags & ELF::SHF_SUNW_NODISCARD) - OS << 'R'; - - // If there are target-specific flags, print them. - Triple::ArchType Arch = T.getArch(); - if (Arch == Triple::xcore) { - if (Flags & ELF::XCORE_SHF_CP_SECTION) - OS << 'c'; - if (Flags & ELF::XCORE_SHF_DP_SECTION) - OS << 'd'; - } else if (T.isARM() || T.isThumb()) { - if (Flags & ELF::SHF_ARM_PURECODE) - OS << 'y'; - } else if (T.isAArch64()) { - if (Flags & ELF::SHF_AARCH64_PURECODE) - OS << 'y'; - } else if (Arch == Triple::hexagon) { - if (Flags & ELF::SHF_HEX_GPREL) - OS << 's'; - } else if (Arch == Triple::x86_64) { - if (Flags & ELF::SHF_X86_64_LARGE) - OS << 'l'; - } - - OS << '"'; - - OS << ','; - - // If comment string is '@', e.g. as on ARM - use '%' instead - if (MAI.getCommentString()[0] == '@') - OS << '%'; - else - OS << '@'; - - if (Type == ELF::SHT_INIT_ARRAY) - OS << "init_array"; - else if (Type == ELF::SHT_FINI_ARRAY) - OS << "fini_array"; - else if (Type == ELF::SHT_PREINIT_ARRAY) - OS << "preinit_array"; - else if (Type == ELF::SHT_NOBITS) - OS << "nobits"; - else if (Type == ELF::SHT_NOTE) - OS << "note"; - else if (Type == ELF::SHT_PROGBITS) - OS << "progbits"; - else if (Type == ELF::SHT_X86_64_UNWIND) - OS << "unwind"; - else if (Type == ELF::SHT_MIPS_DWARF) - // Print hex value of the flag while we do not have - // any standard symbolic representation of the flag. 
- OS << "0x7000001e"; - else if (Type == ELF::SHT_LLVM_ODRTAB) - OS << "llvm_odrtab"; - else if (Type == ELF::SHT_LLVM_LINKER_OPTIONS) - OS << "llvm_linker_options"; - else if (Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE) - OS << "llvm_call_graph_profile"; - else if (Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES) - OS << "llvm_dependent_libraries"; - else if (Type == ELF::SHT_LLVM_SYMPART) - OS << "llvm_sympart"; - else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP) - OS << "llvm_bb_addr_map"; - else if (Type == ELF::SHT_LLVM_OFFLOADING) - OS << "llvm_offloading"; - else if (Type == ELF::SHT_LLVM_LTO) - OS << "llvm_lto"; - else if (Type == ELF::SHT_LLVM_JT_SIZES) - OS << "llvm_jt_sizes"; - else if (Type == ELF::SHT_LLVM_CFI_JUMP_TABLE) - OS << "llvm_cfi_jump_table"; - else - OS << "0x" << Twine::utohexstr(Type); - - if (EntrySize) { - assert((Flags & ELF::SHF_MERGE) || Type == ELF::SHT_LLVM_CFI_JUMP_TABLE); - OS << "," << EntrySize; - } - - if (Flags & ELF::SHF_LINK_ORDER) { - OS << ","; - if (LinkedToSym) - printName(OS, LinkedToSym->getName()); - else - OS << '0'; - } - - if (Flags & ELF::SHF_GROUP) { - OS << ","; - printName(OS, Group.getPointer()->getName()); - if (isComdat()) - OS << ",comdat"; - } - - if (isUnique()) - OS << ",unique," << UniqueID; - - OS << '\n'; - - if (Subsection) { - OS << "\t.subsection\t" << Subsection; - OS << '\n'; - } -} - -bool MCSectionELF::useCodeAlign() const { - return getFlags() & ELF::SHF_EXECINSTR; -} diff --git a/llvm/lib/MC/MCSectionGOFF.cpp b/llvm/lib/MC/MCSectionGOFF.cpp deleted file mode 100644 index 8163e5b..0000000 --- a/llvm/lib/MC/MCSectionGOFF.cpp +++ /dev/null @@ -1,143 +0,0 @@ -//===- MCSectionGOFF.cpp - GOFF Code Section Representation ---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionGOFF.h" -#include "llvm/BinaryFormat/GOFF.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, - GOFF::ESDAlignment Alignment, - GOFF::ESDLoadingBehavior LoadBehavior, - GOFF::ESDExecutable Executable, bool IsReadOnly, - uint32_t SortKey, uint8_t FillByteValue, - StringRef PartName) { - OS << Name << " CATTR "; - OS << "ALIGN(" << static_cast<unsigned>(Alignment) << ")," - << "FILL(" << static_cast<unsigned>(FillByteValue) << ")"; - switch (LoadBehavior) { - case GOFF::ESD_LB_Deferred: - OS << ",DEFLOAD"; - break; - case GOFF::ESD_LB_NoLoad: - OS << ",NOLOAD"; - break; - default: - break; - } - switch (Executable) { - case GOFF::ESD_EXE_CODE: - OS << ",EXECUTABLE"; - break; - case GOFF::ESD_EXE_DATA: - OS << ",NOTEXECUTABLE"; - break; - default: - break; - } - if (IsReadOnly) - OS << ",READONLY"; - if (Rmode != GOFF::ESD_RMODE_None) { - OS << ','; - OS << "RMODE("; - switch (Rmode) { - case GOFF::ESD_RMODE_24: - OS << "24"; - break; - case GOFF::ESD_RMODE_31: - OS << "31"; - break; - case GOFF::ESD_RMODE_64: - OS << "64"; - break; - case GOFF::ESD_RMODE_None: - break; - } - OS << ')'; - } - if (SortKey) - OS << ",PRIORITY(" << SortKey << ")"; - if (!PartName.empty()) - OS << ",PART(" << PartName << ")"; - OS << '\n'; -} - -static void emitXATTR(raw_ostream &OS, StringRef Name, - GOFF::ESDLinkageType Linkage, - GOFF::ESDExecutable Executable, - GOFF::ESDBindingScope BindingScope) { - OS << Name << " XATTR "; - OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),"; - if (Executable != GOFF::ESD_EXE_Unspecified) - OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? 
"CODE" : "DATA") - << "),"; - if (BindingScope != GOFF::ESD_BSC_Unspecified) { - OS << "SCOPE("; - switch (BindingScope) { - case GOFF::ESD_BSC_Section: - OS << "SECTION"; - break; - case GOFF::ESD_BSC_Module: - OS << "MODULE"; - break; - case GOFF::ESD_BSC_Library: - OS << "LIBRARY"; - break; - case GOFF::ESD_BSC_ImportExport: - OS << "EXPORT"; - break; - default: - break; - } - OS << ')'; - } - OS << '\n'; -} - -void MCSectionGOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - switch (SymbolType) { - case GOFF::ESD_ST_SectionDefinition: { - OS << Name << " CSECT\n"; - Emitted = true; - break; - } - case GOFF::ESD_ST_ElementDefinition: { - getParent()->printSwitchToSection(MAI, T, OS, Subsection); - if (!Emitted) { - emitCATTR(OS, Name, EDAttributes.Rmode, EDAttributes.Alignment, - EDAttributes.LoadBehavior, GOFF::ESD_EXE_Unspecified, - EDAttributes.IsReadOnly, 0, EDAttributes.FillByteValue, - StringRef()); - Emitted = true; - } else - OS << Name << " CATTR\n"; - break; - } - case GOFF::ESD_ST_PartReference: { - MCSectionGOFF *ED = getParent(); - ED->getParent()->printSwitchToSection(MAI, T, OS, Subsection); - if (!Emitted) { - emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode, - ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior, - PRAttributes.Executable, ED->EDAttributes.IsReadOnly, - PRAttributes.SortKey, ED->EDAttributes.FillByteValue, Name); - emitXATTR(OS, Name, PRAttributes.Linkage, PRAttributes.Executable, - PRAttributes.BindingScope); - ED->Emitted = true; - Emitted = true; - } else - OS << ED->getName() << " CATTR PART(" << Name << ")\n"; - break; - } - default: - llvm_unreachable("Wrong section type"); - } -}
\ No newline at end of file diff --git a/llvm/lib/MC/MCSectionMachO.cpp b/llvm/lib/MC/MCSectionMachO.cpp index 67453ce..67c8235 100644 --- a/llvm/lib/MC/MCSectionMachO.cpp +++ b/llvm/lib/MC/MCSectionMachO.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/raw_ostream.h" @@ -92,7 +93,7 @@ ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC) MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section, unsigned TAA, unsigned reserved2, SectionKind K, MCSymbol *Begin) - : MCSection(SV_MachO, Section, K.isText(), + : MCSection(Section, K.isText(), MachO::isVirtualSection(TAA & MachO::SECTION_TYPE), Begin), TypeAndAttributes(TAA), Reserved2(reserved2) { assert(Segment.size() <= 16 && Section.size() <= 16 && @@ -105,19 +106,20 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section, } } -void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - OS << "\t.section\t" << getSegmentName() << ',' << getName(); +void MCAsmInfoDarwin::printSwitchToSection(const MCSection &Section, uint32_t, + const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionMachO &>(Section); + OS << "\t.section\t" << Sec.getSegmentName() << ',' << Sec.getName(); // Get the section type and attributes. - unsigned TAA = getTypeAndAttributes(); + unsigned TAA = Sec.getTypeAndAttributes(); if (TAA == 0) { OS << '\n'; return; } - MachO::SectionType SectionType = getType(); + MachO::SectionType SectionType = Sec.getType(); assert(SectionType <= MachO::LAST_KNOWN_SECTION_TYPE && "Invalid SectionType specified!"); @@ -135,8 +137,8 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, if (SectionAttrs == 0) { // If we have a S_SYMBOL_STUBS size specified, print it along with 'none' as // the attribute specifier. - if (Reserved2 != 0) - OS << ",none," << Reserved2; + if (Sec.Reserved2 != 0) + OS << ",none," << Sec.Reserved2; OS << '\n'; return; } @@ -164,15 +166,11 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, assert(SectionAttrs == 0 && "Unknown section attributes!"); // If we have a S_SYMBOL_STUBS size specified, print it. - if (Reserved2 != 0) - OS << ',' << Reserved2; + if (Sec.Reserved2 != 0) + OS << ',' << Sec.Reserved2; OS << '\n'; } -bool MCSectionMachO::useCodeAlign() const { - return hasAttribute(MachO::S_ATTR_PURE_INSTRUCTIONS); -} - /// ParseSectionSpecifier - Parse the section specifier indicated by "Spec". /// This is a string that can appear after a .section directive in a mach-o /// flavored .s file. If successful, this fills in the specified Out diff --git a/llvm/lib/MC/MCSectionWasm.cpp b/llvm/lib/MC/MCSectionWasm.cpp deleted file mode 100644 index e25af1c..0000000 --- a/llvm/lib/MC/MCSectionWasm.cpp +++ /dev/null @@ -1,101 +0,0 @@ -//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionWasm.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSymbolWasm.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -// Decides whether a '.section' directive -// should be printed before the section name. -bool MCSectionWasm::shouldOmitSectionDirective(StringRef Name, - const MCAsmInfo &MAI) const { - return MAI.shouldOmitSectionDirective(Name); -} - -static void printName(raw_ostream &OS, StringRef Name) { - if (Name.find_first_not_of("0123456789_." - "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { - OS << Name; - return; - } - OS << '"'; - for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { - if (*B == '"') // Unquoted " - OS << "\\\""; - else if (*B != '\\') // Neither " or backslash - OS << *B; - else if (B + 1 == E) // Trailing backslash - OS << "\\\\"; - else { - OS << B[0] << B[1]; // Quoted character - ++B; - } - } - OS << '"'; -} - -void MCSectionWasm::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - - if (shouldOmitSectionDirective(getName(), MAI)) { - OS << '\t' << getName(); - if (Subsection) - OS << '\t' << Subsection; - OS << '\n'; - return; - } - - OS << "\t.section\t"; - printName(OS, getName()); - OS << ",\""; - - if (IsPassive) - OS << 'p'; - if (Group) - OS << 'G'; - if (SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS) - OS << 'S'; - if (SegmentFlags & wasm::WASM_SEG_FLAG_TLS) - OS << 'T'; - if (SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN) - OS << 'R'; - - OS << '"'; - - OS << ','; - - // If comment string is '@', e.g. as on ARM - use '%' instead - if (MAI.getCommentString()[0] == '@') - OS << '%'; - else - OS << '@'; - - // TODO: Print section type. - - if (Group) { - OS << ","; - printName(OS, Group->getName()); - OS << ",comdat"; - } - - if (isUnique()) - OS << ",unique," << UniqueID; - - OS << '\n'; - - if (Subsection) - OS << "\t.subsection\t" << Subsection << '\n'; -} - -bool MCSectionWasm::useCodeAlign() const { return false; } diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp deleted file mode 100644 index 41043b2..0000000 --- a/llvm/lib/MC/MCSectionXCOFF.cpp +++ /dev/null @@ -1,134 +0,0 @@ -//===- lib/MC/MCSectionXCOFF.cpp - XCOFF Code Section Representation ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionXCOFF.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -namespace llvm { -class MCExpr; -class Triple; -} // namespace llvm - -using namespace llvm; - -MCSectionXCOFF::~MCSectionXCOFF() = default; - -void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const { - OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n'; -} - -void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - if (getKind().isText()) { - if (getMappingClass() != XCOFF::XMC_PR) - report_fatal_error("Unhandled storage-mapping class for .text csect"); - - printCsectDirective(OS); - return; - } - - if (getKind().isReadOnly()) { - if (getMappingClass() != XCOFF::XMC_RO && - getMappingClass() != XCOFF::XMC_TD) - report_fatal_error("Unhandled storage-mapping class for .rodata csect."); - printCsectDirective(OS); - return; - } - - if (getKind().isReadOnlyWithRel()) { - if (getMappingClass() != XCOFF::XMC_RW && - getMappingClass() != XCOFF::XMC_RO && - getMappingClass() != XCOFF::XMC_TD) - report_fatal_error( - "Unexepected storage-mapping class for ReadOnlyWithRel kind"); - printCsectDirective(OS); - return; - } - - // Initialized TLS data. - if (getKind().isThreadData()) { - // We only expect XMC_TL here for initialized TLS data. - if (getMappingClass() != XCOFF::XMC_TL) - report_fatal_error("Unhandled storage-mapping class for .tdata csect."); - printCsectDirective(OS); - return; - } - - if (getKind().isData()) { - switch (getMappingClass()) { - case XCOFF::XMC_RW: - case XCOFF::XMC_DS: - case XCOFF::XMC_TD: - printCsectDirective(OS); - break; - case XCOFF::XMC_TC: - case XCOFF::XMC_TE: - break; - case XCOFF::XMC_TC0: - OS << "\t.toc\n"; - break; - default: - report_fatal_error( - "Unhandled storage-mapping class for .data csect."); - } - return; - } - - if (isCsect() && getMappingClass() == XCOFF::XMC_TD) { - // Common csect type (uninitialized storage) does not have to print csect - // directive for section switching unless it is local. - if (getKind().isCommon() && !getKind().isBSSLocal()) - return; - - assert(getKind().isBSS() && "Unexpected section kind for toc-data"); - printCsectDirective(OS); - return; - } - // Common csect type (uninitialized storage) does not have to print csect - // directive for section switching. - if (isCsect() && getCSectType() == XCOFF::XTY_CM) { - assert((getMappingClass() == XCOFF::XMC_RW || - getMappingClass() == XCOFF::XMC_BS || - getMappingClass() == XCOFF::XMC_UL) && - "Generated a storage-mapping class for a common/bss/tbss csect we " - "don't " - "understand how to switch to."); - // Common symbols and local zero-initialized symbols for TLS and Non-TLS are - // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to cover - // TLS common and zero-initialized local symbols since linkage type (in the - // GlobalVariable) is not accessible in this class. - assert((getKind().isBSSLocal() || getKind().isCommon() || - getKind().isThreadBSS()) && - "wrong symbol type for .bss/.tbss csect"); - // Don't have to print a directive for switching to section for commons and - // zero-initialized TLS data. The '.comm' and '.lcomm' directives of the - // variable will create the needed csect. 
- return; - } - - // Zero-initialized TLS data with weak or external linkage are not eligible to - // be put into common csect. - if (getKind().isThreadBSS()) { - printCsectDirective(OS); - return; - } - - // XCOFF debug sections. - if (getKind().isMetadata() && isDwarfSect()) { - OS << "\n\t.dwsect " << format("0x%" PRIx32, *getDwarfSubtypeFlags()) - << '\n'; - OS << getName() << ':' << '\n'; - return; - } - - report_fatal_error("Printing for this SectionKind is unimplemented."); -} - -bool MCSectionXCOFF::useCodeAlign() const { return getKind().isText(); } diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 30198c9..bc73981 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -56,12 +56,11 @@ void MCTargetStreamer::finish() {} void MCTargetStreamer::emitConstantPools() {} -void MCTargetStreamer::changeSection(const MCSection *CurSection, - MCSection *Section, uint32_t Subsection, - raw_ostream &OS) { - Section->printSwitchToSection(*Streamer.getContext().getAsmInfo(), - Streamer.getContext().getTargetTriple(), OS, - Subsection); +void MCTargetStreamer::changeSection(const MCSection *, MCSection *Sec, + uint32_t Subsection, raw_ostream &OS) { + auto &MAI = *Streamer.getContext().getAsmInfo(); + MAI.printSwitchToSection(*Sec, Subsection, + Streamer.getContext().getTargetTriple(), OS); } void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) { @@ -415,7 +414,7 @@ void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { void MCStreamer::emitConditionalAssignment(MCSymbol *Symbol, const MCExpr *Value) {} -void MCStreamer::emitCFISections(bool EH, bool Debug) {} +void MCStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {} void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) { if (!FrameInfoStack.empty() && @@ -838,8 +837,8 @@ static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID, if (TextSec == Context.getObjectFileInfo()->getTextSection()) return MainCFISec; - const auto *TextSecCOFF = cast<MCSectionCOFF>(TextSec); - auto *MainCFISecCOFF = cast<MCSectionCOFF>(MainCFISec); + const auto *TextSecCOFF = static_cast<const MCSectionCOFF *>(TextSec); + auto *MainCFISecCOFF = static_cast<MCSectionCOFF *>(MainCFISec); unsigned UniqueID = TextSecCOFF->getOrAssignWinCFISectionID(NextWinCFIID); // If this section is COMDAT, this unwind section should be COMDAT associative @@ -1314,9 +1313,20 @@ void MCStreamer::emitZerofill(MCSection *, MCSymbol *, uint64_t, Align, SMLoc) { } void MCStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) {} -void MCStreamer::changeSection(MCSection *Section, uint32_t) { - CurFrag = &Section->getDummyFragment(); + +void MCStreamer::changeSection(MCSection *Sec, uint32_t) { + CurFrag = &Sec->getDummyFragment(); + auto *Sym = Sec->getBeginSymbol(); + if (!Sym || !Sym->isUndefined()) + return; + // In Mach-O, DWARF sections use Begin as a temporary label, requiring a label + // definition, unlike section symbols in other file formats. 
+ if (getContext().getObjectFileType() == MCContext::IsMachO) + emitLabel(Sym); + else + Sym->setFragment(CurFrag); } + void MCStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {} void MCStreamer::emitBytes(StringRef Data) {} void MCStreamer::emitBinaryData(StringRef Data) { emitBytes(Data); } @@ -1358,9 +1368,6 @@ void MCStreamer::switchSection(MCSection *Section, uint32_t Subsection) { changeSection(Section, Subsection); SectionStack.back().first = MCSectionSubPair(Section, Subsection); assert(!Section->hasEnded() && "Section already ended"); - MCSymbol *Sym = Section->getBeginSymbol(); - if (Sym && !Sym->isInSection()) - emitLabel(Sym); } } @@ -1387,9 +1394,6 @@ void MCStreamer::switchSectionNoPrint(MCSection *Section) { SectionStack.back().second = SectionStack.back().first; SectionStack.back().first = MCSectionSubPair(Section, 0); changeSection(Section, 0); - MCSymbol *Sym = Section->getBeginSymbol(); - if (Sym && !Sym->isInSection()) - emitLabel(Sym); } MCSymbol *MCStreamer::endSection(MCSection *Section) { diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp index bff4b8d..be6d19d 100644 --- a/llvm/lib/MC/MCTargetOptions.cpp +++ b/llvm/lib/MC/MCTargetOptions.cpp @@ -19,7 +19,8 @@ MCTargetOptions::MCTargetOptions() PreserveAsmComments(true), Dwarf64(false), EmitDwarfUnwind(EmitDwarfUnwindType::Default), MCUseDwarfDirectory(DefaultDwarfDirectory), - EmitCompactUnwindNonCanonical(false), PPCUseFullRegisterNames(false) {} + EmitCompactUnwindNonCanonical(false), EmitSFrameUnwind(false), + PPCUseFullRegisterNames(false) {} StringRef MCTargetOptions::getABIName() const { return ABIName; diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 2adc291..ff95ff7 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -41,6 +41,7 @@ MCOPT(int, DwarfVersion) MCOPT(bool, Dwarf64) MCOPT(EmitDwarfUnwindType, EmitDwarfUnwind) MCOPT(bool, EmitCompactUnwindNonCanonical) +MCOPT(bool, EmitSFrameUnwind) MCOPT(bool, ShowMCInst) MCOPT(bool, FatalWarnings) MCOPT(bool, NoWarn) @@ -105,6 +106,11 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { false)); // By default, use DWARF for non-canonical personalities. 
MCBINDOPT(EmitCompactUnwindNonCanonical); + static cl::opt<bool> EmitSFrameUnwind( + "gsframe", cl::desc("Whether to emit .sframe unwind sections."), + cl::init(false)); + MCBINDOPT(EmitSFrameUnwind); + static cl::opt<bool> ShowMCInst( "asm-show-inst", cl::desc("Emit internal instruction representation to assembly file")); @@ -188,6 +194,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() { Options.X86Sse2Avx = getX86Sse2Avx(); Options.EmitDwarfUnwind = getEmitDwarfUnwind(); Options.EmitCompactUnwindNonCanonical = getEmitCompactUnwindNonCanonical(); + Options.EmitSFrameUnwind = getEmitSFrameUnwind(); Options.AsSecureLogFile = getAsSecureLogFile(); return Options; diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 5891420c..e3ef111 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -58,7 +58,7 @@ void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F, void MCWasmStreamer::changeSection(MCSection *Section, uint32_t Subsection) { MCAssembler &Asm = getAssembler(); - auto *SectionWasm = cast<MCSectionWasm>(Section); + auto *SectionWasm = static_cast<const MCSectionWasm *>(Section); const MCSymbol *Grp = SectionWasm->getGroup(); if (Grp) Asm.registerSymbol(*Grp); diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 9369bea..1ffe25c 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -157,7 +157,8 @@ void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { // Ensure that the first and the second symbols relative to the section are // the section symbol and the COMDAT symbol. getAssembler().registerSymbol(*Section->getBeginSymbol()); - if (auto *Sym = cast<MCSectionCOFF>(Section)->getCOMDATSymbol()) + if (auto *Sym = + static_cast<const MCSectionCOFF *>(Section)->getCOMDATSymbol()) getAssembler().registerSymbol(*Sym); } diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 63381b4..26f45ce 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -36,6 +36,20 @@ XCOFFObjectWriter &MCXCOFFStreamer::getWriter() { return static_cast<XCOFFObjectWriter &>(getAssembler().getWriter()); } +void MCXCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { + MCObjectStreamer::changeSection(Section, Subsection); + auto *Sec = static_cast<const MCSectionXCOFF *>(Section); + // We might miss calculating the symbols difference as absolute value before + // adding fixups when symbol_A without the fragment set is the csect itself + // and symbol_B is in it. + // TODO: Currently we only set the fragment for XMC_PR csects and DWARF + // sections because we don't have other cases that hit this problem yet. + // if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR) + // QualName->setFragment(F); + if (Sec->isDwarfSect() || Sec->getMappingClass() == XCOFF::XMC_PR) + Sec->getQualNameSymbol()->setFragment(CurFrag); +} + bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym, MCSymbolAttr Attribute) { auto *Symbol = cast<MCSymbolXCOFF>(Sym); @@ -89,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility( void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { // Add a Fixup here to later record a relocation of type R_REF to prevent the // ref symbol from being garbage collected (by the binder). 
- MCFragment *DF = getCurrentFragment(); - const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); - std::optional<MCFixupKind> MaybeKind = - getAssembler().getBackend().getFixupKind("R_REF"); - if (!MaybeKind) - report_fatal_error("failed to get fixup kind for R_REF relocation"); - - MCFixupKind Kind = *MaybeKind; - MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind); - DF->addFixup(Fixup); + addFixup(MCSymbolRefExpr::create(Symbol, getContext()), + XCOFF::RelocationType::R_REF); } void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name, diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 48d2fc6..e87696a 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -126,7 +126,8 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S) const { uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm, const MCSection *Sec) const { uint64_t EndAddr = getSectionAddress(Sec) + Asm.getSectionAddressSize(*Sec); - unsigned Next = cast<MCSectionMachO>(Sec)->getLayoutOrder() + 1; + unsigned Next = + static_cast<const MCSectionMachO *>(Sec)->getLayoutOrder() + 1; if (Next >= SectionOrder.size()) return 0; @@ -259,15 +260,12 @@ void MachObjectWriter::writeSegmentLoadCommand( } void MachObjectWriter::writeSection(const MCAssembler &Asm, - const MCSection &Sec, uint64_t VMAddr, + const MCSectionMachO &Sec, uint64_t VMAddr, uint64_t FileOffset, unsigned Flags, uint64_t RelocationsStart, unsigned NumRelocations) { - uint64_t SectionSize = Asm.getSectionAddressSize(Sec); - const MCSectionMachO &Section = cast<MCSectionMachO>(Sec); - // The offset is unused for virtual sections. - if (Section.isBssSection()) { + if (Sec.isBssSection()) { assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!"); FileOffset = 0; } @@ -275,11 +273,11 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, // struct section (68 bytes) or // struct section_64 (80 bytes) + uint64_t SectionSize = Asm.getSectionAddressSize(Sec); uint64_t Start = W.OS.tell(); (void) Start; - - writeWithPadding(Section.getName(), 16); - writeWithPadding(Section.getSegmentName(), 16); + writeWithPadding(Sec.getName(), 16); + writeWithPadding(Sec.getSegmentName(), 16); if (is64Bit()) { W.write<uint64_t>(VMAddr); // address W.write<uint64_t>(SectionSize); // size @@ -290,14 +288,14 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, assert(isUInt<32>(FileOffset) && "Cannot encode offset of section"); W.write<uint32_t>(FileOffset); - W.write<uint32_t>(Log2(Section.getAlign())); + W.write<uint32_t>(Log2(Sec.getAlign())); assert((!NumRelocations || isUInt<32>(RelocationsStart)) && "Cannot encode offset of relocations"); W.write<uint32_t>(NumRelocations ? RelocationsStart : 0); W.write<uint32_t>(NumRelocations); W.write<uint32_t>(Flags); W.write<uint32_t>(IndirectSymBase.lookup(&Sec)); // reserved1 - W.write<uint32_t>(Section.getStubSize()); // reserved2 + W.write<uint32_t>(Sec.getStubSize()); // reserved2 if (is64Bit()) W.write<uint32_t>(0); // reserved3 @@ -531,7 +529,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Report errors for use of .indirect_symbol not in a symbol pointer section // or stub section. 
for (IndirectSymbolData &ISD : IndirectSymbols) { - const MCSectionMachO &Section = cast<MCSectionMachO>(*ISD.Section); + const MCSectionMachO &Section = static_cast<MCSectionMachO &>(*ISD.Section); if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && @@ -545,7 +543,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Bind non-lazy symbol pointers first. for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) { - const auto &Section = cast<MCSectionMachO>(*ISD.Section); + const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section); if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS) @@ -559,7 +557,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Then lazy symbol pointers and symbol stubs. for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) { - const auto &Section = cast<MCSectionMachO>(*ISD.Section); + const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section); if (Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_SYMBOL_STUBS) @@ -684,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) { for (MCSection &Sec : Asm) { if (!Sec.isBssSection()) { SectionOrder.push_back(&Sec); - cast<MCSectionMachO>(Sec).setLayoutOrder(i++); + static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++); } } for (MCSection &Sec : Asm) { if (Sec.isBssSection()) { SectionOrder.push_back(&Sec); - cast<MCSectionMachO>(Sec).setLayoutOrder(i++); + static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++); } } @@ -808,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() { } MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data()); + llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data()); } unsigned NumSections = Asm.end() - Asm.begin(); @@ -907,7 +905,7 @@ uint64_t MachObjectWriter::writeObject() { // ... and then the section headers. 
uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize; for (const MCSection &Section : Asm) { - const auto &Sec = cast<MCSectionMachO>(Section); + const auto &Sec = static_cast<const MCSectionMachO &>(Section); std::vector<RelAndSymbol> &Relocs = Relocations[&Sec]; unsigned NumRelocs = Relocs.size(); uint64_t SectionStart = SectionDataStart + getSectionAddress(&Sec); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 3b99af4..bfd6334 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -480,7 +480,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F, // The WebAssembly backend should never generate FKF_IsPCRel fixups assert(!Fixup.isPCRel()); - const auto &FixupSection = cast<MCSectionWasm>(*F.getParent()); + const auto &FixupSection = static_cast<MCSectionWasm &>(*F.getParent()); uint64_t C = Target.getConstant(); uint64_t FixupOffset = Asm->getFragmentOffset(F) + Fixup.getOffset(); MCContext &Ctx = getContext(); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 6ad4334..856850d 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -373,7 +373,7 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) { COFFSection *Sec = nullptr; MCSectionCOFF *MCSec = nullptr; if (Base && Base->getFragment()) { - MCSec = cast<MCSectionCOFF>(Base->getFragment()->getParent()); + MCSec = static_cast<MCSectionCOFF *>(Base->getFragment()->getParent()); Sec = SectionMap[MCSec]; } @@ -1057,7 +1057,8 @@ uint64_t WinCOFFWriter::writeObject() { continue; } - const auto *AssocMCSec = cast<MCSectionCOFF>(&AssocMCSym->getSection()); + const auto *AssocMCSec = + static_cast<const MCSectionCOFF *>(&AssocMCSym->getSection()); assert(SectionMap.count(AssocMCSec)); COFFSection *AssocSec = SectionMap[AssocMCSec]; diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 2f6785f..65f543b 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -550,13 +550,13 @@ CsectGroup &XCOFFWriter::getCsectGroup(const MCSectionXCOFF *MCSec) { static MCSectionXCOFF *getContainingCsect(const MCSymbolXCOFF *XSym) { if (XSym->isDefined()) - return cast<MCSectionXCOFF>(XSym->getFragment()->getParent()); + return static_cast<MCSectionXCOFF *>(XSym->getFragment()->getParent()); return XSym->getRepresentedCsect(); } void XCOFFWriter::executePostLayoutBinding() { for (const auto &S : *Asm) { - const auto *MCSec = cast<const MCSectionXCOFF>(&S); + auto *MCSec = static_cast<const MCSectionXCOFF *>(&S); assert(!SectionMap.contains(MCSec) && "Cannot add a section twice."); // If the name does not fit in the storage provided in the symbol table @@ -747,7 +747,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, FixedValue = TOCEntryOffset; } } else if (Type == XCOFF::RelocationType::R_RBR) { - MCSectionXCOFF *ParentSec = cast<MCSectionXCOFF>(F.getParent()); + auto *ParentSec = static_cast<MCSectionXCOFF *>(F.getParent()); assert((SymASec->getMappingClass() == XCOFF::XMC_PR && ParentSec->getMappingClass() == XCOFF::XMC_PR) && "Only XMC_PR csect may have the R_RBR relocation."); @@ -768,7 +768,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, } XCOFFRelocation Reloc = {Index, FixupOffsetInCsect, SignAndSize, Type}; - MCSectionXCOFF *RelocationSec = cast<MCSectionXCOFF>(F.getParent()); + auto *RelocationSec = static_cast<MCSectionXCOFF 
*>(F.getParent()); assert(SectionMap.contains(RelocationSec) && "Expected containing csect to exist in map."); SectionMap[RelocationSec]->Relocations.push_back(Reloc); diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp index 62a71d4..9b55f76 100644 --- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp @@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const { // it is, find the target section unique id. const coff_aux_section_definition *SD = SymRef.getSectionDefinition(); const coff_aux_weak_external *WE = SymRef.getWeakExternal(); - if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) { + if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) { int32_t Index = SD->getNumber(IsBigObj); if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size()) return createStringError(object_error::parse_failed, diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 2579fa3..0f19495 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -8,11 +8,11 @@ #include "llvm/Object/IRSymtab.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Comdat.h" @@ -213,9 +213,10 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) { - DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); +static StringSet<> buildPreservedSymbolsSet(const Triple &TT) { + StringSet<> PreservedSymbolSet; + PreservedSymbolSet.insert(std::begin(PreservedSymbols), + std::end(PreservedSymbols)); // FIXME: Do we need to pass in ABI fields from TargetOptions? RTLIB::RuntimeLibcallsInfo Libcalls(TT); for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { @@ -280,7 +281,7 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, setStr(Sym.IRName, GV->getName()); - static const DenseSet<StringRef> PreservedSymbolsSet = + static const StringSet<> PreservedSymbolsSet = buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp index 2d74d1d..5863490 100644 --- a/llvm/lib/Object/SFrameParser.cpp +++ b/llvm/lib/Object/SFrameParser.cpp @@ -10,27 +10,41 @@ #include "llvm/BinaryFormat/SFrame.h" #include "llvm/Object/Error.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; using namespace llvm::object; -template <typename T> -static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, - uint64_t Offset) { - static_assert(std::is_trivial_v<T>); - if (Data.size() < Offset + sizeof(T)) { +static Expected<ArrayRef<uint8_t>> +getDataSlice(ArrayRef<uint8_t> Data, uint64_t Offset, uint64_t Size) { + uint64_t End = SaturatingAdd(Offset, Size); + // Data.size() cannot be UINT64_MAX, as it would occupy the whole address + // space. 
+ if (End > Data.size()) { return createStringError( formatv("unexpected end of data at offset {0:x} while reading [{1:x}, " "{2:x})", - Data.size(), Offset, Offset + sizeof(T)) + Data.size(), Offset, End) .str(), object_error::unexpected_eof); } - return *reinterpret_cast<const T *>(Data.data() + Offset); + return Data.slice(Offset, Size); +} + +template <typename T> +static Expected<const T &> getDataSliceAs(ArrayRef<uint8_t> Data, + uint64_t Offset) { + static_assert(std::is_trivial_v<T>); + Expected<ArrayRef<uint8_t>> Slice = getDataSlice(Data, Offset, sizeof(T)); + if (!Slice) + return Slice.takeError(); + + return *reinterpret_cast<const T *>(Slice->data()); } template <endianness E> -Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { +Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents, + uint64_t SectionAddress) { Expected<const sframe::Preamble<E> &> Preamble = getDataSliceAs<sframe::Preamble<E>>(Contents, 0); if (!Preamble) @@ -48,8 +62,44 @@ Expected<SFrameParser<E>> SFrameParser<E>::create(ArrayRef<uint8_t> Contents) { getDataSliceAs<sframe::Header<E>>(Contents, 0); if (!Header) return Header.takeError(); - return SFrameParser(Contents, *Header); + return SFrameParser(Contents, SectionAddress, *Header); +} + +template <endianness E> +Expected<ArrayRef<uint8_t>> SFrameParser<E>::getAuxHeader() const { + return getDataSlice(Data, sizeof(Header), Header.AuxHdrLen); +} + +template <endianness E> +Expected<ArrayRef<sframe::FuncDescEntry<E>>> SFrameParser<E>::fdes() const { + Expected<ArrayRef<uint8_t>> Slice = getDataSlice( + Data, getFDEBase(), Header.NumFDEs * sizeof(sframe::FuncDescEntry<E>)); + if (!Slice) + return Slice.takeError(); + return ArrayRef( + reinterpret_cast<const sframe::FuncDescEntry<E> *>(Slice->data()), + Header.NumFDEs); +} + +template <endianness E> +uint64_t SFrameParser<E>::getAbsoluteStartAddress( + typename FDERange::iterator FDE) const { + uint64_t Result = SectionAddress + FDE->StartAddress; + + if ((getPreamble().Flags.value() & sframe::Flags::FDEFuncStartPCRel) == + sframe::Flags::FDEFuncStartPCRel) { + uintptr_t DataPtr = reinterpret_cast<uintptr_t>(Data.data()); + uintptr_t FDEPtr = reinterpret_cast<uintptr_t>(&*FDE); + + assert(DataPtr <= FDEPtr && FDEPtr < DataPtr + Data.size() && + "Iterator does not belong to this object!"); + + Result += FDEPtr - DataPtr; + } + + return Result; } -template class llvm::object::SFrameParser<endianness::big>; -template class llvm::object::SFrameParser<endianness::little>; +template class LLVM_EXPORT_TEMPLATE llvm::object::SFrameParser<endianness::big>; +template class LLVM_EXPORT_TEMPLATE + llvm::object::SFrameParser<endianness::little>; diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index bb7ccdb..1b111dc 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -84,6 +84,7 @@ MODULE_PASS("global-merge-func", GlobalMergeFuncPass()) MODULE_PASS("globalopt", GlobalOptPass()) MODULE_PASS("globalsplit", GlobalSplitPass()) MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass()) +MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass()) MODULE_PASS("hipstdpar-select-accelerator-code", HipStdParAcceleratorCodeSelectionPass()) MODULE_PASS("hotcoldsplit", HotColdSplittingPass()) @@ -119,7 +120,6 @@ MODULE_PASS("module-inline", ModuleInlinerPass()) MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("nsan", 
NumericalStabilitySanitizerPass()) -MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) MODULE_PASS("openmp-opt", OpenMPOptPass()) MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)) diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 5c7b9e0..886add7 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1295,7 +1295,7 @@ Error IndexedInstrProfReader::readHeader() { // Writer first writes the length of compressed string, and then the actual // content. const char *VTableNamePtr = (const char *)Ptr; - if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) + if (VTableNamePtr > DataBuffer->getBufferEnd()) return make_error<InstrProfError>(instrprof_error::truncated); VTableName = StringRef(VTableNamePtr, CompressedVTableNamesLen); diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp index 235b134..3fc0dbf 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -135,7 +135,7 @@ readMemInfoBlocksV3(const char *Ptr) { } llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>> -readMemInfoBlocksV4(const char *Ptr) { +readMemInfoBlocksCommon(const char *Ptr, bool IsHistogramEncoded = false) { using namespace support; const uint64_t NumItemsToRead = @@ -145,27 +145,74 @@ readMemInfoBlocksV4(const char *Ptr) { for (uint64_t I = 0; I < NumItemsToRead; I++) { const uint64_t Id = endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr); - // We cheat a bit here and remove the const from cast to set the - // Histogram Pointer to newly allocated buffer. - MemInfoBlock MIB = *reinterpret_cast<const MemInfoBlock *>(Ptr); - // Only increment by size of MIB since readNext implicitly increments. - Ptr += sizeof(MemInfoBlock); + MemInfoBlock MIB; +#define READ_MIB_FIELD(FIELD) \ + MIB.FIELD = endian::readNext<decltype(MIB.FIELD), llvm::endianness::little, \ + unaligned>(Ptr) + + READ_MIB_FIELD(AllocCount); + READ_MIB_FIELD(TotalAccessCount); + READ_MIB_FIELD(MinAccessCount); + READ_MIB_FIELD(MaxAccessCount); + READ_MIB_FIELD(TotalSize); + READ_MIB_FIELD(MinSize); + READ_MIB_FIELD(MaxSize); + READ_MIB_FIELD(AllocTimestamp); + READ_MIB_FIELD(DeallocTimestamp); + READ_MIB_FIELD(TotalLifetime); + READ_MIB_FIELD(MinLifetime); + READ_MIB_FIELD(MaxLifetime); + READ_MIB_FIELD(AllocCpuId); + READ_MIB_FIELD(DeallocCpuId); + READ_MIB_FIELD(NumMigratedCpu); + READ_MIB_FIELD(NumLifetimeOverlaps); + READ_MIB_FIELD(NumSameAllocCpu); + READ_MIB_FIELD(NumSameDeallocCpu); + READ_MIB_FIELD(DataTypeId); + READ_MIB_FIELD(TotalAccessDensity); + READ_MIB_FIELD(MinAccessDensity); + READ_MIB_FIELD(MaxAccessDensity); + READ_MIB_FIELD(TotalLifetimeAccessDensity); + READ_MIB_FIELD(MinLifetimeAccessDensity); + READ_MIB_FIELD(MaxLifetimeAccessDensity); + READ_MIB_FIELD(AccessHistogramSize); + READ_MIB_FIELD(AccessHistogram); +#undef READ_MIB_FIELD if (MIB.AccessHistogramSize > 0) { + // The in-memory representation uses uint64_t for histogram entries. 
MIB.AccessHistogram = (uintptr_t)malloc(MIB.AccessHistogramSize * sizeof(uint64_t)); - } - - for (uint64_t J = 0; J < MIB.AccessHistogramSize; J++) { - ((uint64_t *)MIB.AccessHistogram)[J] = - endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr); + for (uint64_t J = 0; J < MIB.AccessHistogramSize; J++) { + if (!IsHistogramEncoded) { + ((uint64_t *)MIB.AccessHistogram)[J] = + endian::readNext<uint64_t, llvm::endianness::little, unaligned>( + Ptr); + } else { + // The encoded on-disk format (V5 onwards) uses uint16_t. + const uint16_t Val = + endian::readNext<uint16_t, llvm::endianness::little, unaligned>( + Ptr); + ((uint64_t *)MIB.AccessHistogram)[J] = decodeHistogramCount(Val); + } + } } Items.push_back({Id, MIB}); } return Items; } +llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>> +readMemInfoBlocksV4(const char *Ptr) { + return readMemInfoBlocksCommon(Ptr); +} + +llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>> +readMemInfoBlocksV5(const char *Ptr) { + return readMemInfoBlocksCommon(Ptr, /*IsHistogramEncoded=*/true); +} + CallStackMap readStackInfo(const char *Ptr) { using namespace support; @@ -658,6 +705,8 @@ RawMemProfReader::readMemInfoBlocks(const char *Ptr) { return readMemInfoBlocksV3(Ptr); if (MemprofRawVersion == 4ULL) return readMemInfoBlocksV4(Ptr); + if (MemprofRawVersion == 5ULL) + return readMemInfoBlocksV5(Ptr); llvm_unreachable( "Panic: Unsupported version number when reading MemInfoBlocks"); } diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index d5c3cba..8491633 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -68,11 +68,19 @@ template class LLVM_EXPORT_TEMPLATE basic_parser<float>; template class LLVM_EXPORT_TEMPLATE basic_parser<std::string>; template class LLVM_EXPORT_TEMPLATE basic_parser<char>; -template class opt<unsigned>; -template class opt<int>; -template class opt<std::string>; -template class opt<char>; -template class opt<bool>; +#if !(defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS) && defined(_MSC_VER)) +// Only instantiate opt<std::string> when not building a Windows DLL. When +// exporting opt<std::string>, MSVC implicitly exports symbols for +// std::basic_string through transitive inheritance via std::string. These +// symbols may appear in clients, leading to duplicate symbol conflicts. +template class LLVM_EXPORT_TEMPLATE opt<std::string>; +#endif + +template class LLVM_EXPORT_TEMPLATE opt<bool>; +template class LLVM_EXPORT_TEMPLATE opt<char>; +template class LLVM_EXPORT_TEMPLATE opt<int>; +template class LLVM_EXPORT_TEMPLATE opt<unsigned>; + } // namespace cl } // namespace llvm @@ -95,6 +103,15 @@ void parser<float>::anchor() {} void parser<std::string>::anchor() {} void parser<char>::anchor() {} +// These anchor functions instantiate opt<T> and reference its virtual +// destructor to ensure MSVC exports the corresponding vtable and typeinfo when +// building a Windows DLL. Without an explicit reference, MSVC may omit the +// instantiation at link time even if it is marked DLL-export. 
+void opt_bool_anchor() { opt<bool> anchor{""}; }
+void opt_char_anchor() { opt<char> anchor{""}; }
+void opt_int_anchor() { opt<int> anchor{""}; }
+void opt_unsigned_anchor() { opt<unsigned> anchor{""}; }
+
 //===----------------------------------------------------------------------===//
 
 const static size_t DefaultPad = 2;
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 5bb04d0..b6f338f 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -24,11 +24,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/circular_raw_ostream.h"
 #include "llvm/Support/raw_ostream.h"
+#include <utility>
 
 #include "DebugOptions.h"
 
@@ -38,27 +40,62 @@
 
 using namespace llvm;
 
+/// Parse a debug type string into a pair of the debug type and the debug level.
+/// The expected format is "type[:level]", where the level is an optional
+/// integer.
+static std::pair<std::string, std::optional<int>>
+parseDebugType(StringRef DbgType) {
+  std::optional<int> Level;
+  size_t ColonPos = DbgType.find(':');
+  if (ColonPos != StringRef::npos) {
+    StringRef LevelStr = DbgType.substr(ColonPos + 1);
+    DbgType = DbgType.take_front(ColonPos);
+    if (LevelStr.empty())
+      Level = 0;
+    else {
+      int parsedLevel;
+      if (to_integer(LevelStr, parsedLevel, 10))
+        Level = parsedLevel;
+    }
+  }
+  return std::make_pair(DbgType.str(), Level);
+}
+
 // Even though LLVM might be built with NDEBUG, define symbols that the code
 // built without NDEBUG can depend on via the llvm/Support/Debug.h header.
 namespace llvm {
 
 /// Exported boolean set by the -debug option.
 bool DebugFlag = false;
 
-static ManagedStatic<std::vector<std::string>> CurrentDebugType;
+/// The current debug type and an optional debug level.
+/// The debug level is the verbosity of the debug output.
+/// 0 is a special level that acts as an opt-out for this specific debug type.
+/// If provided, the debug output is enabled only if the user specified a level
+/// at least as high as the provided level.
+static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>>
+    CurrentDebugType;
 
 /// Return true if the specified string is the debug type
 /// specified on the command line, or if none was specified on the command line
 /// with the -debug-only=X option.
-bool isCurrentDebugType(const char *DebugType) {
+bool isCurrentDebugType(const char *DebugType, int Level) {
   if (CurrentDebugType->empty())
     return true;
+  // Track whether there is at least one enabled debug type; this is used to
+  // allow opting out of some debug types while leaving all the others enabled.
+  bool HasEnabledDebugType = false;
   // See if DebugType is in list. Note: do not use find() as that forces us to
   // unnecessarily create an std::string instance.
-  for (auto &d : *CurrentDebugType) {
-    if (d == DebugType)
+  for (auto &D : *CurrentDebugType) {
+    HasEnabledDebugType =
+        HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0);
+    if (D.first != DebugType)
+      continue;
+    if (!D.second.has_value())
       return true;
+    return D.second >= Level;
   }
-  return false;
+  return !HasEnabledDebugType;
 }
 
 /// Set the current debug type, as if the -debug-only=X
@@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) {
 
 void setCurrentDebugTypes(const char **Types, unsigned Count) {
   CurrentDebugType->clear();
-  llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count));
+  CurrentDebugType->reserve(Count);
+  for (const char *Type : ArrayRef(Types, Count))
+    CurrentDebugType->push_back(parseDebugType(Type));
 }
+
 } // namespace llvm
 
 // All Debug.h functionality is a no-op in NDEBUG mode.
@@ -114,10 +154,10 @@ struct DebugOnlyOpt {
     if (Val.empty())
       return;
     DebugFlag = true;
-    SmallVector<StringRef,8> dbgTypes;
-    StringRef(Val).split(dbgTypes, ',', -1, false);
-    for (auto dbgType : dbgTypes)
-      CurrentDebugType->push_back(std::string(dbgType));
+    SmallVector<StringRef, 8> DbgTypes;
+    StringRef(Val).split(DbgTypes, ',', -1, false);
+    for (auto DbgType : DbgTypes)
+      CurrentDebugType->push_back(parseDebugType(DbgType));
   }
 };
 } // namespace
@@ -129,8 +169,13 @@ struct CreateDebugOnly {
   static void *call() {
     return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>(
         "debug-only",
-        cl::desc("Enable a specific type of debug output (comma separated list "
-                 "of types)"),
+        cl::desc(
+            "Enable a specific type of debug output (comma separated list "
+            "of types using the format \"type[:level]\", where the level "
+            "is an optional integer. The level can be set to 1, 2, 3, etc. to "
+            "control the verbosity of the output. Setting a debug-type level "
+            "to zero acts as an opt-out for this specific debug-type without "
+            "affecting the others."),
         cl::Hidden, cl::value_desc("debug string"),
         cl::location(DebugOnlyOptLoc), cl::ValueRequired);
   }
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 277247e..cc02cae 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1190,7 +1190,7 @@ Expected<size_t> readNativeFile(file_t FD, MutableArrayRef<char> Buf) {
   size_t Size = Buf.size();
 #endif
   ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Size);
-  if (ssize_t(NumRead) == -1)
+  if (NumRead == -1)
     return errorCodeToError(errnoAsErrorCode());
   // The underlying operation on these platforms allow opening directories
   // for reading in more cases than other platforms.
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index d862dbd..8dd7c88 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -106,7 +106,67 @@ void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
   Name.clear();
 }
 
+namespace llvm::sys::windows {
+HMODULE loadSystemModuleSecure(LPCWSTR lpModuleName) {
+  // Ensure we indeed load a module from the system32 path.
+  // As per GetModuleHandle documentation:
+  // "If lpModuleName does not include a path and there is more than one loaded
+  // module with the same base name and extension, you cannot predict which
+  // module handle will be returned.".
This mitigates
+  // https://learn.microsoft.com/en-us/security-updates/securityadvisories/2010/2269637
+  SmallVector<wchar_t, MAX_PATH> Buf;
+  size_t Size = MAX_PATH;
+  do {
+    Buf.resize_for_overwrite(Size);
+    SetLastError(NO_ERROR);
+    Size = ::GetSystemDirectoryW(Buf.data(), Buf.size());
+    if (Size == 0)
+      return NULL;
+
+    // Try again with larger buffer.
+  } while (Size > Buf.size());
+
+  Buf.truncate(Size);
+  Buf.push_back(L'\\');
+  Buf.append(lpModuleName, lpModuleName + std::wcslen(lpModuleName));
+  Buf.push_back(0);
+
+  return ::GetModuleHandleW(Buf.data());
+}
+} // namespace llvm::sys::windows
+
 SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
+  HMODULE kernelM = llvm::sys::windows::loadSystemModuleSecure(L"kernel32.dll");
+  if (kernelM) {
+    // SetThreadInformation is only available on Windows 8 and later. Since we
+    // still support compilation on Windows 7, we load the function dynamically.
+    typedef BOOL(WINAPI * SetThreadInformation_t)(
+        HANDLE hThread, THREAD_INFORMATION_CLASS ThreadInformationClass,
+        _In_reads_bytes_(ThreadInformationSize) PVOID ThreadInformation,
+        ULONG ThreadInformationSize);
+    static const auto pfnSetThreadInformation =
+        (SetThreadInformation_t)::GetProcAddress(kernelM,
+                                                 "SetThreadInformation");
+    if (pfnSetThreadInformation) {
+      auto setThreadInformation = [](ULONG ControlMaskAndStateMask) {
+        THREAD_POWER_THROTTLING_STATE state{};
+        state.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+        state.ControlMask = ControlMaskAndStateMask;
+        state.StateMask = ControlMaskAndStateMask;
+        return pfnSetThreadInformation(
+            ::GetCurrentThread(), ThreadPowerThrottling, &state, sizeof(state));
+      };
+
+      // Use EcoQoS for ThreadPriority::Background when available (running on
+      // the most efficient cores at the most efficient cpu frequency):
+      // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadinformation
+      // https://learn.microsoft.com/en-us/windows/win32/procthread/quality-of-service
+      setThreadInformation(Priority == ThreadPriority::Background
+                               ? THREAD_POWER_THROTTLING_EXECUTION_SPEED
+                               : 0);
+    }
+  }
+
   // https://docs.microsoft.com/en-us/windows/desktop/api/processthreadsapi/nf-processthreadsapi-setthreadpriority
   // Begin background processing mode.
The system lowers the resource scheduling // priorities of the thread so that it can perform background work without diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 1f3e5dc..3f318e2 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const { } break; + case GETDAGOPNAME: + if (const auto *Dag = dyn_cast<DagInit>(LHS)) { + return Dag->getName(); + } + break; + case LOG2: if (const auto *LHSi = dyn_cast_or_null<IntInit>( LHS->convertInitializerTo(IntRecTy::get(RK)))) { @@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const { case SIZE: Result = "!size"; break; case EMPTY: Result = "!empty"; break; case GETDAGOP: Result = "!getdagop"; break; + case GETDAGOPNAME: + Result = "!getdagopname"; + break; case LOG2 : Result = "!logtwo"; break; case LISTFLATTEN: Result = "!listflatten"; @@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { SmallVector<std::pair<const Init *, const StringInit *>, 8> Args; llvm::append_range(Args, LHSs->getArgAndNames()); llvm::append_range(Args, RHSs->getArgAndNames()); - return DagInit::get(Op, Args); + // Use the name of the LHS DAG if it's set, otherwise the name of the RHS. + const auto *NameInit = LHSs->getName(); + if (!NameInit) + NameInit = RHSs->getName(); + return DagInit::get(Op, NameInit, Args); } break; } @@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames()); break; } + case SETDAGOPNAME: { + const auto *Dag = dyn_cast<DagInit>(LHS); + const auto *Op = dyn_cast<StringInit>(RHS); + if (Dag && Op) + return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(), + Dag->getArgNames()); + break; + } case ADD: case SUB: case MUL: @@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const { case STRCONCAT: Result = "!strconcat"; break; case INTERLEAVE: Result = "!interleave"; break; case SETDAGOP: Result = "!setdagop"; break; + case SETDAGOPNAME: + Result = "!setdagopname"; + break; case GETDAGARG: Result = "!getdagarg<" + getType()->getAsString() + ">"; break; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index aea1bb0..c369916 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("find", tgtok::XFind) .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated. .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated. 
+ .Case("setdagopname", tgtok::XSetDagOpName) + .Case("getdagopname", tgtok::XGetDagOpName) .Case("getdagarg", tgtok::XGetDagArg) .Case("getdagname", tgtok::XGetDagName) .Case("setdagarg", tgtok::XSetDagArg) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index ed7d8f3..5725e39 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -150,6 +150,8 @@ enum TokKind { XGt, XSetDagOp, XGetDagOp, + XSetDagOpName, + XGetDagOpName, XExists, XListRemove, XToLower, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 62c5355..81b61b1 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "TGParser.h" +#include "TGLexer.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XCast: case tgtok::XRepr: case tgtok::XGetDagOp: + case tgtok::XGetDagOpName: case tgtok::XInitialized: { // Value ::= !unop '(' Value ')' UnOpInit::UnaryOp Code; const RecTy *Type = nullptr; @@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } Code = UnOpInit::GETDAGOP; break; + case tgtok::XGetDagOpName: + Lex.Lex(); // eat the operation + Type = StringRecTy::get(Records); + Code = UnOpInit::GETDAGOPNAME; + break; case tgtok::XInitialized: Lex.Lex(); // eat the operation Code = UnOpInit::INITIALIZED; @@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XInterleave: case tgtok::XGetDagArg: case tgtok::XGetDagName: - case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')' + case tgtok::XSetDagOp: + case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')' tgtok::TokKind OpTok = Lex.getCode(); SMLoc OpLoc = Lex.getLoc(); Lex.Lex(); // eat the operation @@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break; case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break; case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break; + case tgtok::XSetDagOpName: + Code = BinOpInit::SETDAGOPNAME; + break; case tgtok::XGetDagArg: Code = BinOpInit::GETDAGARG; break; @@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } ArgType = DagRecTy::get(Records); break; + case tgtok::XSetDagOpName: + Type = DagRecTy::get(Records); + ArgType = DagRecTy::get(Records); + break; case tgtok::XGetDagName: Type = StringRecTy::get(Records); ArgType = DagRecTy::get(Records); @@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { // Deal with BinOps whose arguments have different types, by // rewriting ArgType in between them. switch (Code) { - case BinOpInit::SETDAGOP: - // After parsing the first dag argument, switch to expecting - // a record, with no restriction on its superclasses. - ArgType = RecordRecTy::get(Records, {}); - break; - case BinOpInit::GETDAGARG: - // After parsing the first dag argument, expect an index integer or a - // name string. - ArgType = nullptr; - break; - case BinOpInit::GETDAGNAME: - // After parsing the first dag argument, expect an index integer. 
- ArgType = IntRecTy::get(Records); - break; - default: - break; + case BinOpInit::SETDAGOPNAME: + // After parsing the first dag argument, expect a string. + ArgType = StringRecTy::get(Records); + break; + case BinOpInit::SETDAGOP: + // After parsing the first dag argument, switch to expecting + // a record, with no restriction on its superclasses. + ArgType = RecordRecTy::get(Records, {}); + break; + case BinOpInit::GETDAGARG: + // After parsing the first dag argument, expect an index integer or a + // name string. + ArgType = nullptr; + break; + case BinOpInit::GETDAGNAME: + // After parsing the first dag argument, expect an index integer. + ArgType = IntRecTy::get(Records); + break; + default: + break; } if (!consume(tgtok::comma)) diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index c4b43e1..c52487a 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -176,6 +176,9 @@ public: std::optional<AArch64PACKey::ID> PACKey, uint64_t PACDisc, Register PACAddrDisc); + // Emit the sequence for PAC. + void emitPtrauthSign(const MachineInstr *MI); + // Emit the sequence to compute the discriminator. // // The returned register is either unmodified AddrDisc or ScratchReg. @@ -2175,6 +2178,37 @@ void AArch64AsmPrinter::emitPtrauthAuthResign( OutStreamer->emitLabel(EndSym); } +void AArch64AsmPrinter::emitPtrauthSign(const MachineInstr *MI) { + Register Val = MI->getOperand(1).getReg(); + auto Key = (AArch64PACKey::ID)MI->getOperand(2).getImm(); + uint64_t Disc = MI->getOperand(3).getImm(); + Register AddrDisc = MI->getOperand(4).getReg(); + bool AddrDiscKilled = MI->getOperand(4).isKill(); + + // As long as at least one of Val and AddrDisc is in GPR64noip, a scratch + // register is available. + Register ScratchReg = Val == AArch64::X16 ? 
AArch64::X17 : AArch64::X16; + assert(ScratchReg != AddrDisc && + "Neither X16 nor X17 is available as a scratch register"); + + // Compute pac discriminator + assert(isUInt<16>(Disc)); + Register DiscReg = emitPtrauthDiscriminator( + Disc, AddrDisc, ScratchReg, /*MayUseAddrAsScratch=*/AddrDiscKilled); + bool IsZeroDisc = DiscReg == AArch64::XZR; + unsigned Opc = getPACOpcodeForKey(Key, IsZeroDisc); + + // paciza x16 ; if IsZeroDisc + // pacia x16, x17 ; if !IsZeroDisc + MCInst PACInst; + PACInst.setOpcode(Opc); + PACInst.addOperand(MCOperand::createReg(Val)); + PACInst.addOperand(MCOperand::createReg(Val)); + if (!IsZeroDisc) + PACInst.addOperand(MCOperand::createReg(DiscReg)); + EmitToStreamer(*OutStreamer, PACInst); +} + void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) { bool IsCall = MI->getOpcode() == AArch64::BLRA; unsigned BrTarget = MI->getOperand(0).getReg(); @@ -2890,6 +2924,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MI->getOperand(4).getImm(), MI->getOperand(5).getReg()); return; + case AArch64::PAC: + emitPtrauthSign(MI); + return; + case AArch64::LOADauthptrstatic: LowerLOADauthptrstatic(*MI); return; diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ca09598..99f0af5 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -39,8 +39,8 @@ let Predicates = [HasDotProd] in { def ext_addv_to_udot_addv : GICombineRule< (defs root:$root, ext_addv_to_udot_addv_matchinfo:$matchinfo), (match (wip_match_opcode G_VECREDUCE_ADD):$root, - [{ return matchExtAddvToUdotAddv(*${root}, MRI, STI, ${matchinfo}); }]), - (apply [{ applyExtAddvToUdotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }]) + [{ return matchExtAddvToDotAddv(*${root}, MRI, STI, ${matchinfo}); }]), + (apply [{ applyExtAddvToDotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }]) >; } @@ -62,8 +62,10 @@ class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICom def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>; def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>; +def push_mul_through_zext : push_opcode_through_ext<G_MUL, G_ZEXT>; def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>; def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>; +def push_mul_through_sext : push_opcode_through_ext<G_MUL, G_SEXT>; def AArch64PreLegalizerCombiner: GICombiner< "AArch64PreLegalizerCombinerImpl", [all_combines, @@ -75,8 +77,10 @@ def AArch64PreLegalizerCombiner: GICombiner< ext_uaddv_to_uaddlv, push_sub_through_zext, push_add_through_zext, + push_mul_through_zext, push_sub_through_sext, - push_add_through_sext]> { + push_add_through_sext, + push_mul_through_sext]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index eca7ca5..ad42f4b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5296,7 +5296,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5307,7 +5307,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == 
MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5317,7 +5317,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5327,7 +5327,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5341,7 +5341,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ld1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5352,7 +5352,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5362,7 +5362,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5372,7 +5372,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad( Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO); else if (Subtarget->hasSVE2p1()) @@ -5386,7 +5386,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x2: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM_PSEUDO, AArch64::LDNT1B_2Z_PSEUDO); @@ -5398,7 +5398,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM_PSEUDO, AArch64::LDNT1H_2Z_PSEUDO); @@ -5409,7 +5409,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM_PSEUDO, AArch64::LDNT1W_2Z_PSEUDO); @@ -5420,7 +5420,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == 
MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM_PSEUDO, AArch64::LDNT1D_2Z_PSEUDO); @@ -5435,7 +5435,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_ldnt1_pn_x4: { if (VT == MVT::nxv16i8) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM_PSEUDO, AArch64::LDNT1B_4Z_PSEUDO); @@ -5447,7 +5447,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM_PSEUDO, AArch64::LDNT1H_4Z_PSEUDO); @@ -5458,7 +5458,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM_PSEUDO, AArch64::LDNT1W_4Z_PSEUDO); @@ -5469,7 +5469,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - if (Subtarget->hasSME2()) + if (Subtarget->hasSME2() && Subtarget->isStreaming()) SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM_PSEUDO, AArch64::LDNT1D_4Z_PSEUDO); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f2b69ed..4f6e3dd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3101,6 +3101,83 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, return BB; } +// Helper function to find the instruction that defined a virtual register. +// If unable to find such instruction, returns nullptr. +static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI, + Register Reg) { + while (Reg.isVirtual()) { + MachineInstr *DefMI = MRI.getVRegDef(Reg); + assert(DefMI && "Virtual register definition not found"); + unsigned Opcode = DefMI->getOpcode(); + + if (Opcode == AArch64::COPY) { + Reg = DefMI->getOperand(1).getReg(); + // Vreg is defined by copying from physreg. + if (Reg.isPhysical()) + return DefMI; + continue; + } + if (Opcode == AArch64::SUBREG_TO_REG) { + Reg = DefMI->getOperand(2).getReg(); + continue; + } + + return DefMI; + } + return nullptr; +} + +void AArch64TargetLowering::fixupPtrauthDiscriminator( + MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, + MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + + Register AddrDisc = AddrDiscOp.getReg(); + int64_t IntDisc = IntDiscOp.getImm(); + assert(IntDisc == 0 && "Blend components are already expanded"); + + const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc); + if (DiscMI) { + switch (DiscMI->getOpcode()) { + case AArch64::MOVKXi: + // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48". + // #imm should be an immediate and not a global symbol, for example. 
+ if (DiscMI->getOperand(2).isImm() && + DiscMI->getOperand(3).getImm() == 48) { + AddrDisc = DiscMI->getOperand(1).getReg(); + IntDisc = DiscMI->getOperand(2).getImm(); + } + break; + case AArch64::MOVi32imm: + case AArch64::MOVi64imm: + // Small immediate integer constant passed via VReg. + if (DiscMI->getOperand(1).isImm() && + isUInt<16>(DiscMI->getOperand(1).getImm())) { + AddrDisc = AArch64::NoRegister; + IntDisc = DiscMI->getOperand(1).getImm(); + } + break; + } + } + + // For uniformity, always use NoRegister, as XZR is not necessarily contained + // in the requested register class. + if (AddrDisc == AArch64::XZR) + AddrDisc = AArch64::NoRegister; + + // Make sure AddrDisc operand respects the register class imposed by MI. + if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) { + Register TmpReg = MRI.createVirtualRegister(AddrDiscRC); + BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc); + AddrDisc = TmpReg; + } + + AddrDiscOp.setReg(AddrDisc); + IntDiscOp.setImm(IntDisc); +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3199,6 +3276,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true); case AArch64::MOVT_TIZ_PSEUDO: return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true); + + case AArch64::PAC: + fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4), + &AArch64::GPR64noipRegClass); + return BB; } } @@ -4223,7 +4305,6 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(VT0, VT1); SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS, OpRHS, OpCarryIn); @@ -4232,7 +4313,7 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, IsSigned ? 
overflowFlagToValue(Sum.getValue(1), VT1, DAG) : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); - return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); + return DAG.getMergeValues({Sum, OutFlag}, DL); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { @@ -4257,8 +4338,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Overflow = DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow); + return DAG.getMergeValues({Value, Overflow}, DL); } // Prefetch operands are: @@ -6816,7 +6896,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), - {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo), + DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()}, StoreNode->getMemoryVT(), StoreNode->getMemOperand()); return Result; } @@ -8871,6 +8952,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID &CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFunction::CallSiteInfo CSInfo; @@ -8910,6 +8992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, *DAG.getContext()); RetCCInfo.AnalyzeCallResult(Ins, RetCC); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Check callee args/returns for SVE registers and set calling convention // accordingly. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { @@ -11244,7 +11330,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue AArch64TargetLowering::LowerSELECT_CC( ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, - iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs, + iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. @@ -11305,6 +11391,22 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Canonicalise absolute difference patterns: + // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> + // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc + // + // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc -> + // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc + // The second forms can be matched into subs+cneg. 
+ if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) { + if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS && + FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) + FVal = DAG.getNegative(TVal, DL, TVal.getValueType()); + else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS && + FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) + TVal = DAG.getNegative(FVal, DL, FVal.getValueType()); + } + unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in @@ -11442,7 +11544,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return true; } })) { - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs; + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs(); SDValue VectorCmp = emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG); if (VectorCmp) @@ -11456,7 +11558,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); - if (DAG.getTarget().Options.UnsafeFPMath) { + if (Flags.hasNoSignedZeros()) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); @@ -11535,10 +11637,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); - bool HasNoNans = Op->getFlags().hasNoNaNs(); + SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, - DAG); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, @@ -11546,7 +11647,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); - bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); EVT Ty = Op.getValueType(); @@ -11613,8 +11713,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, DAG.getUNDEF(MVT::f32), FVal); } - SDValue Res = - LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG); + SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), + Op->getFlags(), DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); @@ -12211,7 +12311,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags = SDNodeFlags::AllowReassociation; + // Ensure nodes can be recognized by isAssociativeAndCommutative. 
+  SDNodeFlags Flags =
+      SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
 
   // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
   // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -16593,7 +16695,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
   return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
            isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
-            Options.UnsafeFPMath));
+            I->getFastMathFlags().allowContract()));
 }
 
 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -24031,6 +24133,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
                      Store->getMemOperand());
 }
 
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows us to store the in-vector result directly
+// without transferring the result into a GPR in the process.
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  // Limit to post-legalization in order to avoid peeling truncating stores.
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+  if (!Subtarget->isNeonAvailable())
+    return SDValue();
+  // Bail out if the source operand is already a vector.
+  SDValue Value = ST->getValue();
+  if (Value.getValueType().isVector())
+    return SDValue();
+
+  // Look through potential assertions.
+  while (Value->isAssert())
+    Value = Value.getOperand(0);
+
+  if (Value.getOpcode() != ISD::FP_TO_SINT &&
+      Value.getOpcode() != ISD::FP_TO_UINT)
+    return SDValue();
+  if (!Value->hasOneUse())
+    return SDValue();
+
+  SDValue FPSrc = Value.getOperand(0);
+  EVT SrcVT = FPSrc.getValueType();
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return SDValue();
+
+  // No support for assignments such as i64 = fp_to_sint f32
+  EVT VT = Value.getSimpleValueType();
+  if (VT != SrcVT.changeTypeToInteger())
+    return SDValue();
+
+  // Create a 128-bit element vector to avoid widening. The floating point
+  // conversion is transformed into a single element conversion via a pattern.
+ unsigned NumElements = 128 / SrcVT.getFixedSizeInBits(); + EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements); + EVT VecDstVT = VecSrcVT.changeTypeToInteger(); + SDLoc DL(ST); + SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc); + SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP); + + SDValue Zero = DAG.getVectorIdxConstant(0, DL); + SDValue Extracted = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero); + + DCI.CombineTo(ST->getValue().getNode(), Extracted); + return SDValue(ST, 0); +} + bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || @@ -24113,6 +24269,9 @@ static SDValue performSTORECombine(SDNode *N, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(ST); + if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget)) + return Res; + auto hasValidElementTypeForFPTruncStore = [](EVT VT) { EVT EltVT = VT.getVectorElementType(); return EltVT == MVT::f32 || EltVT == MVT::f64; @@ -26845,6 +27004,23 @@ static SDValue performSHLCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); } +static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) { + unsigned IntrinsicID = N->getConstantOperandVal(1); + auto Register = + (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR + : AArch64SysReg::RNDRRS); + SDLoc DL(N); + SDValue A = DAG.getNode( + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), + N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); + SDValue B = DAG.getNode( + AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); + return DAG.getMergeValues( + {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -27160,22 +27336,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); case Intrinsic::aarch64_rndr: - case Intrinsic::aarch64_rndrrs: { - unsigned IntrinsicID = N->getConstantOperandVal(1); - auto Register = - (IntrinsicID == Intrinsic::aarch64_rndr ? 
AArch64SysReg::RNDR - : AArch64SysReg::RNDRRS); - SDLoc DL(N); - SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), - N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); - SDValue B = DAG.getNode( - AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); - return DAG.getMergeValues( - {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); - } + case Intrinsic::aarch64_rndrrs: + return performRNDRCombine(N, DAG); case Intrinsic::aarch64_sme_ldr_zt: return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), DAG.getVTList(MVT::Other), N->getOperand(0), @@ -27913,16 +28075,16 @@ void AArch64TargetLowering::ReplaceNodeResults( MemVT.getScalarSizeInBits() == 32u || MemVT.getScalarSizeInBits() == 64u)) { + EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext()); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::LDNP, SDLoc(N), - DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - MVT::Other}), + DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}), {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), LoadNode->getMemOperand()); SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT, - Result.getValue(0), Result.getValue(1)); + DAG.getBitcast(HalfVT, Result.getValue(0)), + DAG.getBitcast(HalfVT, Result.getValue(1))); Results.append({Pair, Result.getValue(2) /* Chain */}); return; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d8403c2..ea63edd8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -182,6 +182,13 @@ public: MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const; + /// Replace (0, vreg) discriminator components with the operands of blend + /// or with (immediate, NoRegister) when possible. 
+ void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, + MachineOperand &IntDiscOp, + MachineOperand &AddrDiscOp, + const TargetRegisterClass *AddrDiscRC) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -655,7 +662,7 @@ private: SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, iterator_range<SDNode::user_iterator> Users, - bool HasNoNans, const SDLoc &dl, + SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index bc57537..59d4fd2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,7 +20,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -36,7 +35,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -533,8 +531,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, MBP.LHS = LastInst->getOperand(0); MBP.RHS = MachineOperand::CreateImm(0); - MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE - : MachineBranchPredicate::PRED_EQ; + MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW) + ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; return false; } @@ -6575,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by // the target options or if FADD/FSUB has the contract fast-math flag. 
- return Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast || + return Options.AllowFPOpFusion == FPOpFusion::Fast || Inst.getFlag(MachineInstr::FmContract); - return true; } return false; } @@ -6681,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, case AArch64::FMUL_ZZZ_H: case AArch64::FMUL_ZZZ_S: case AArch64::FMUL_ZZZ_D: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || - (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && - Inst.getFlag(MachineInstr::MIFlag::FmNsz)); + return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && + Inst.getFlag(MachineInstr::MIFlag::FmNsz); // == Integer types == // -- Base instructions -- @@ -7353,9 +7349,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: - case AArch64MachineCombinerPattern::GATHER_LANE_i32: - case AArch64MachineCombinerPattern::GATHER_LANE_i16: - case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7396,252 +7389,11 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } -static bool getGatherPattern(MachineInstr &Root, - SmallVectorImpl<unsigned> &Patterns, - unsigned LoadLaneOpCode, unsigned NumLanes) { - const MachineFunction *MF = Root.getMF(); - - // Early exit if optimizing for size. - if (MF->getFunction().hasMinSize()) - return false; - - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - - // The root of the pattern must load into the last lane of the vector. - if (Root.getOperand(2).getImm() != NumLanes - 1) - return false; - - // Check that we have load into all lanes except lane 0. - // For each load we also want to check that: - // 1. It has a single non-debug use (since we will be replacing the virtual - // register) - // 2. That the addressing mode only uses a single offset register. - auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - auto Range = llvm::seq<unsigned>(1, NumLanes - 1); - SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end()); - while (!RemainingLanes.empty() && CurrInstr && - CurrInstr->getOpcode() == LoadLaneOpCode && - MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && - CurrInstr->getNumOperands() == 4) { - RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); - CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); - } - - if (!RemainingLanes.empty()) - return false; - - // Match the SUBREG_TO_REG sequence. - if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) - return false; - - // Verify that the subreg to reg loads an integer into the first lane. - auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); - unsigned SingleLaneSizeInBits = 128 / NumLanes; - if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) - return false; - - // Verify that it also has a single non debug use. 
- if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) - return false; - - switch (NumLanes) { - case 4: - Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); - break; - case 8: - Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); - break; - case 16: - Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); - break; - default: - llvm_unreachable("Got bad number of lanes for gather pattern."); - } - - return true; -} - -/// Search for patterns where we use LD1 instructions to load into -/// separate lanes of an 128 bit Neon register. We can increase Memory Level -/// Parallelism by loading into 2 Neon registers instead. -static bool getLoadPatterns(MachineInstr &Root, - SmallVectorImpl<unsigned> &Patterns) { - - // The pattern searches for loads into single lanes. - switch (Root.getOpcode()) { - case AArch64::LD1i32: - return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); - case AArch64::LD1i16: - return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); - case AArch64::LD1i8: - return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); - default: - return false; - } -} - -static void -generateGatherPattern(MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<Register, unsigned> &InstrIdxForVirtReg, - unsigned Pattern, unsigned NumLanes) { - - MachineFunction &MF = *Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - - // Gather the initial load instructions to build the pattern - SmallVector<MachineInstr *, 16> LoadToLaneInstrs; - MachineInstr *CurrInstr = &Root; - for (unsigned i = 0; i < NumLanes - 1; ++i) { - LoadToLaneInstrs.push_back(CurrInstr); - CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); - } - - // Sort the load instructions according to the lane. 
- llvm::sort(LoadToLaneInstrs, - [](const MachineInstr *A, const MachineInstr *B) { - return A->getOperand(2).getImm() > B->getOperand(2).getImm(); - }); - - MachineInstr *SubregToReg = CurrInstr; - LoadToLaneInstrs.push_back( - MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); - auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); - - const TargetRegisterClass *FPR128RegClass = - MRI.getRegClass(Root.getOperand(0).getReg()); - - auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, - Register SrcRegister, unsigned Lane, - Register OffsetRegister) { - auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); - MachineInstrBuilder LoadIndexIntoRegister = - BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), - NewRegister) - .addReg(SrcRegister) - .addImm(Lane) - .addReg(OffsetRegister, getKillRegState(true)); - InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); - InsInstrs.push_back(LoadIndexIntoRegister); - return NewRegister; - }; - - // Helper to create load instruction based on opcode - auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, - Register OffsetReg) -> MachineInstrBuilder { - unsigned Opcode; - switch (NumLanes) { - case 4: - Opcode = AArch64::LDRSui; - break; - case 8: - Opcode = AArch64::LDRHui; - break; - case 16: - Opcode = AArch64::LDRBui; - break; - default: - llvm_unreachable( - "Got unsupported number of lanes in machine-combiner gather pattern"); - } - // Immediate offset load - return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) - .addReg(OffsetReg) - .addImm(0); // immediate offset - }; - - // Load the remaining lanes into register 0. - auto LanesToLoadToReg0 = - llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, - LoadToLaneInstrsAscending.begin() + NumLanes / 2); - auto PrevReg = SubregToReg->getOperand(0).getReg(); - for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { - PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, - LoadInstr->getOperand(3).getReg()); - DelInstrs.push_back(LoadInstr); - } - auto LastLoadReg0 = PrevReg; - - // First load into register 1. Perform a LDRSui to zero out the upper lanes in - // a single instruction. - auto Lane0Load = *LoadToLaneInstrsAscending.begin(); - auto OriginalSplitLoad = - *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); - auto DestRegForMiddleIndex = MRI.createVirtualRegister( - MRI.getRegClass(Lane0Load->getOperand(0).getReg())); - - MachineInstrBuilder MiddleIndexLoadInstr = - CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, - OriginalSplitLoad->getOperand(3).getReg()); - - InstrIdxForVirtReg.insert( - std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); - InsInstrs.push_back(MiddleIndexLoadInstr); - DelInstrs.push_back(OriginalSplitLoad); - - // Subreg To Reg instruction for register 1. 
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); - unsigned SubregType; - switch (NumLanes) { - case 4: - SubregType = AArch64::ssub; - break; - case 8: - SubregType = AArch64::hsub; - break; - case 16: - SubregType = AArch64::bsub; - break; - default: - llvm_unreachable( - "Got invalid NumLanes for machine-combiner gather pattern"); - } - - auto SubRegToRegInstr = - BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), - DestRegForSubregToReg) - .addImm(0) - .addReg(DestRegForMiddleIndex, getKillRegState(true)) - .addImm(SubregType); - InstrIdxForVirtReg.insert( - std::make_pair(DestRegForSubregToReg, InsInstrs.size())); - InsInstrs.push_back(SubRegToRegInstr); - - // Load remaining lanes into register 1. - auto LanesToLoadToReg1 = - llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, - LoadToLaneInstrsAscending.end()); - PrevReg = SubRegToRegInstr->getOperand(0).getReg(); - for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { - PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, - LoadInstr->getOperand(3).getReg()); - if (Index == NumLanes / 2 - 2) { - break; - } - DelInstrs.push_back(LoadInstr); - } - auto LastLoadReg1 = PrevReg; - - // Create the final zip instruction to combine the results. - MachineInstrBuilder ZipInstr = - BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), - Root.getOperand(0).getReg()) - .addReg(LastLoadReg0) - .addReg(LastLoadReg1); - InsInstrs.push_back(ZipInstr); -} - CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: - case AArch64MachineCombinerPattern::GATHER_LANE_i32: - case AArch64MachineCombinerPattern::GATHER_LANE_i16: - case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7671,10 +7423,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; - // Load patterns - if (getLoadPatterns(Root, Patterns)) - return true; - return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8930,21 +8678,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } - case AArch64MachineCombinerPattern::GATHER_LANE_i32: { - generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, - Pattern, 4); - break; - } - case AArch64MachineCombinerPattern::GATHER_LANE_i16: { - generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, - Pattern, 8); - break; - } - case AArch64MachineCombinerPattern::GATHER_LANE_i8: { - generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, - Pattern, 16); - break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 02734866..7c255da 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, - - GATHER_LANE_i32, - GATHER_LANE_i16, - GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9ebdf2e..251fd44 100644 
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -519,10 +519,10 @@ def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; -def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; -def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; // Generates the general dynamic sequences, i.e. // adrp x0, :tlsdesc:var @@ -2033,7 +2033,7 @@ let Predicates = [HasPAuth] in { def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb"), op>; } - defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>; + defm PAC : SignAuth<0b000, 0b010, "pac", null_frag>; defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>; def XPACI : ClearAuth<0, "xpaci">; @@ -2153,6 +2153,26 @@ let Predicates = [HasPAuth] in { let Uses = []; } + // PAC pseudo instruction. In AsmPrinter, it is expanded into an actual PAC* + // instruction immediately preceded by the discriminator computation. + // This enforces the expected immediate modifier is used for signing, even + // if an attacker is able to substitute AddrDisc. + def PAC : Pseudo<(outs GPR64:$SignedVal), + (ins GPR64:$Val, i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc), + [], "$SignedVal = $Val">, Sched<[WriteI, ReadI]> { + let isCodeGenOnly = 1; + let hasSideEffects = 0; + let mayStore = 0; + let mayLoad = 0; + let Size = 12; + let Defs = [X16, X17]; + let usesCustomInserter = 1; + } + + // A standalone pattern is used, so that literal 0 can be passed as $Disc. + def : Pat<(int_ptrauth_sign GPR64:$Val, timm:$Key, GPR64noip:$AddrDisc), + (PAC GPR64:$Val, $Key, 0, GPR64noip:$AddrDisc)>; + // AUT and re-PAC a value, using different keys/data. // This directly manipulates x16/x17, which are the only registers that // certain OSs guarantee are safe to use for sensitive operations. @@ -6648,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } +def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))), + (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>; +def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>; +def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))), + (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>; + // int -> float conversion of value in lane 0 of simd vector should use // correct cvtf variant to avoid costly fpr <-> gpr register transfers. 
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index abcd550..b97d622 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -12,7 +12,7 @@ // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri // // 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi -// MOVi64imm + ADDXrr ==> ANDXri + ANDXri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri // // 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi // MOVi64imm + SUBXrr ==> SUBXri + SUBXri @@ -125,8 +125,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { template <typename T> bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI); + // Strategy used to split logical immediate bitmasks. + enum class SplitStrategy { + Intersect, + }; template <typename T> - bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0); + bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, unsigned OtherOpc = 0); bool visitORR(MachineInstr &MI); bool visitCSEL(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); @@ -158,14 +163,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt", template <typename T> static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { T UImm = static_cast<T>(Imm); - if (AArch64_AM::isLogicalImmediate(UImm, RegSize)) - return false; - - // If this immediate can be handled by one instruction, do not split it. - SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; - AArch64_IMM::expandMOVImm(UImm, RegSize, Insn); - if (Insn.size() == 1) - return false; // The bitmask immediate consists of consecutive ones. Let's say there is // constant 0b00000000001000000000010000000000 which does not consist of @@ -194,8 +191,9 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) { } template <typename T> -bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, - unsigned OtherOpc) { +bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI, + SplitStrategy Strategy, + unsigned OtherOpc) { // Try below transformation. // // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri @@ -208,9 +206,26 @@ bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI, return splitTwoPartImm<T>( MI, - [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> std::optional<OpcodePair> { - if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) + [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0, + T &Imm1) -> std::optional<OpcodePair> { + // If this immediate is already a suitable bitmask, don't split it. + // TODO: Should we just combine the two instructions in this case? + if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return std::nullopt; + + // If this immediate can be handled by one instruction, don't split it. + SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return std::nullopt; + + bool SplitSucc = false; + switch (Strategy) { + case SplitStrategy::Intersect: + SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1); + break; + } + if (SplitSucc) return std::make_pair(Opc, !OtherOpc ? 
Opc : OtherOpc); return std::nullopt; }, @@ -859,16 +874,20 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed |= visitINSERT(MI); break; case AArch64::ANDWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI); + Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI); + Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI, + SplitStrategy::Intersect); break; case AArch64::ANDSWrr: - Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri); + Changed |= trySplitLogicalImm<uint32_t>( + AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri); break; case AArch64::ANDSXrr: - Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri); + Changed |= trySplitLogicalImm<uint64_t>( + AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri); break; case AArch64::ORRWrs: Changed |= visitORR(MI); diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index c218831..85de2d5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -36,7 +36,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, // SHF_AARCH64_PURECODE flag set if the "+execute-only" target feature is // present. if (TM.getMCSubtargetInfo()->hasFeature(AArch64::FeatureExecuteOnly)) { - auto *Text = cast<MCSectionELF>(TextSection); + auto *Text = static_cast<MCSectionELF *>(TextSection); Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE); } } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 40f49da..18ca22f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4905,14 +4905,17 @@ void AArch64TTIImpl::getUnrollingPreferences( // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; - // No need to unroll auto-vectorized loops - if (findStringMetadataForLoop(L, "llvm.loop.isvectorized")) - return; - // Scan the loop: don't unroll loops with calls as this could prevent - // inlining. + // inlining. Don't unroll auto-vectorized loops either, though do allow + // unrolling of the scalar remainder. + bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized"); for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + // Both auto-vectorized loops and the scalar remainder have the + // isvectorized attribute, so differentiate between them by the presence + // of vector instructions. 
+ if (IsVectorized && I.getType()->isVectorTy()) + return; if (isa<CallBase>(I)) { if (isa<CallInst>(I) || isa<InvokeInst>(I)) if (const Function *F = cast<CallBase>(I).getCalledFunction()) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp index 0b79850..1a15075 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp @@ -50,8 +50,10 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub, // // %sub = G_SUB 0, %y // %cmp = G_ICMP eq/ne, %z, %sub + // or with signed comparisons with the no-signed-wrap flag set if (!MaybeSub || MaybeSub->getOpcode() != TargetOpcode::G_SUB || - !CmpInst::isEquality(Pred)) + (!CmpInst::isEquality(Pred) && + !(CmpInst::isSigned(Pred) && MaybeSub->getFlag(MachineInstr::NoSWrap)))) return false; auto MaybeZero = getIConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 1381a9b..d905692 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1810,7 +1810,7 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp( // Couldn't optimize. Emit a compare + a Bcc. MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - auto PredOp = ICmp.getOperand(1); + auto &PredOp = ICmp.getOperand(1); emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( static_cast<CmpInst::Predicate>(PredOp.getPredicate())); @@ -2506,12 +2506,12 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) { return false; } auto &PredOp = Cmp->getOperand(1); - auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); - const AArch64CC::CondCode InvCC = - changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); MIB.setInstrAndDebugLoc(I); emitIntegerCompare(/*LHS=*/Cmp->getOperand(2), /*RHS=*/Cmp->getOperand(3), PredOp, MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); + const AArch64CC::CondCode InvCC = + changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB); I.eraseFromParent(); return true; @@ -3574,10 +3574,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return false; } - auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); + auto &PredOp = I.getOperand(1); + emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred)); - emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB); emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR, /*Src2=*/AArch64::WZR, InvCC, MIB); I.eraseFromParent(); @@ -5096,11 +5097,11 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { - auto Pred = - static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); + auto &PredOp = CondDef->getOperand(1); + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp, + MIB); + auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate()); CondCode = 
changeICMPPredToAArch64CC(Pred); - emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), - CondDef->getOperand(1), MIB); } else { // Get the condition code for the select. auto Pred = @@ -5148,29 +5149,37 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI); MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI); auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate()); + // Given this: // // x = G_SUB 0, y - // G_ICMP x, z + // G_ICMP z, x // // Produce this: // - // cmn y, z - if (isCMN(LHSDef, P, MRI)) - return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + // cmn z, y + if (isCMN(RHSDef, P, MRI)) + return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); - // Same idea here, but with the RHS of the compare instead: + // Same idea here, but with the LHS of the compare instead: // // Given this: // // x = G_SUB 0, y - // G_ICMP z, x + // G_ICMP x, z // // Produce this: // - // cmn z, y - if (isCMN(RHSDef, P, MRI)) - return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder); + // cmn y, z + // + // But be careful! We need to swap the predicate! + if (isCMN(LHSDef, P, MRI)) { + if (!CmpInst::isEquality(P)) { + P = CmpInst::getSwappedPredicate(P); + Predicate = MachineOperand::CreatePredicate(P); + } + return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder); + } // Given this: // diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index bb0f667b..e0e1af7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1650,6 +1650,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; }; + auto LowerTriOp = [&MI, &MIB](unsigned Opcode) { + MIB.buildInstr(Opcode, {MI.getOperand(0)}, + {MI.getOperand(2), MI.getOperand(3), MI.getOperand(4)}); + MI.eraseFromParent(); + return true; + }; Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { @@ -1828,6 +1834,10 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerBinOp(TargetOpcode::G_USUBSAT); break; } + case Intrinsic::aarch64_neon_udot: + return LowerTriOp(AArch64::G_UDOT); + case Intrinsic::aarch64_neon_sdot: + return LowerTriOp(AArch64::G_SDOT); case Intrinsic::vector_reverse: // TODO: Add support for vector_reverse diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 1cd9453..8c10673 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -228,12 +228,13 @@ void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset))); } -// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y)) -// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1)) +// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y)) +// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y)) +// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1)) // Similar to performVecReduceAddCombine in SelectionDAG -bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, - const AArch64Subtarget &STI, - std::tuple<Register, Register, bool> &MatchInfo) { +bool matchExtAddvToDotAddv(MachineInstr 
&MI, MachineRegisterInfo &MRI, + const AArch64Subtarget &STI, + std::tuple<Register, Register, bool> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && "Expected a G_VECREDUCE_ADD instruction"); assert(STI.hasDotProd() && "Target should have Dot Product feature"); @@ -246,31 +247,57 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32) return false; - LLT SrcTy; - auto I1Opc = I1->getOpcode(); - if (I1Opc == TargetOpcode::G_MUL) { + // Detect mul(ext, ext) with symmetric ext's. If I1Opc is G_ZEXT or G_SEXT + // then the ext's must match the same opcode. It is set to the ext opcode on + // output. + auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1, + Register &Out2, unsigned &I1Opc) { // If result of this has more than 1 use, then there is no point in creating - // udot instruction - if (!MRI.hasOneNonDBGUse(MidReg)) + // a dot instruction + if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; MachineInstr *ExtMI1 = - getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI); + getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI); MachineInstr *ExtMI2 = - getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI); + getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI); LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg()); LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg()); if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy) return false; + if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) && + I1Opc != ExtMI1->getOpcode()) + return false; + Out1 = ExtMI1->getOperand(1).getReg(); + Out2 = ExtMI2->getOperand(1).getReg(); I1Opc = ExtMI1->getOpcode(); - SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg()); - std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg(); - std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg(); + return true; + }; + + LLT SrcTy; + unsigned I1Opc = I1->getOpcode(); + if (I1Opc == TargetOpcode::G_MUL) { + Register Out1, Out2; + if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc)) + return false; + SrcTy = MRI.getType(Out1); + std::get<0>(MatchInfo) = Out1; + std::get<1>(MatchInfo) = Out2; } else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) { - SrcTy = MRI.getType(I1->getOperand(1).getReg()); - std::get<0>(MatchInfo) = I1->getOperand(1).getReg(); - std::get<1>(MatchInfo) = 0; + Register I1Op = I1->getOperand(1).getReg(); + MachineInstr *M = getDefIgnoringCopies(I1Op, MRI); + Register Out1, Out2; + if (M->getOpcode() == TargetOpcode::G_MUL && + tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) { + SrcTy = MRI.getType(Out1); + std::get<0>(MatchInfo) = Out1; + std::get<1>(MatchInfo) = Out2; + } else { + SrcTy = MRI.getType(I1Op); + std::get<0>(MatchInfo) = I1Op; + std::get<1>(MatchInfo) = 0; + } } else { return false; } @@ -288,11 +315,11 @@ bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } -void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &Builder, - GISelChangeObserver &Observer, - const AArch64Subtarget &STI, - std::tuple<Register, Register, bool> &MatchInfo) { +void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &Builder, + GISelChangeObserver &Observer, + const AArch64Subtarget &STI, + std::tuple<Register, Register, bool> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && "Expected a G_VECREDUCE_ADD instruction"); 
assert(STI.hasDotProd() && "Target should have Dot Product feature"); @@ -553,15 +580,15 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } -// Pushes ADD/SUB through extend instructions to decrease the number of extend -// instruction at the end by allowing selection of {s|u}addl sooner - +// Pushes ADD/SUB/MUL through extend instructions to decrease the number of +// extend instruction at the end by allowing selection of {s|u}addl sooner // i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)) bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, Register DstReg, Register SrcReg1, Register SrcReg2) { assert((MI.getOpcode() == TargetOpcode::G_ADD || - MI.getOpcode() == TargetOpcode::G_SUB) && - "Expected a G_ADD or G_SUB instruction\n"); + MI.getOpcode() == TargetOpcode::G_SUB || + MI.getOpcode() == TargetOpcode::G_MUL) && + "Expected a G_ADD, G_SUB or G_MUL instruction\n"); // Deal with vector types only LLT DstTy = MRI.getType(DstReg); @@ -594,9 +621,10 @@ void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); // G_SUB has to sign-extend the result. - // G_ADD needs to sext from sext and can sext or zext from zext, so the - // original opcode is used. - if (MI.getOpcode() == TargetOpcode::G_ADD) + // G_ADD needs to sext from sext and can sext or zext from zext, and G_MUL + // needs to use the original opcode so the original opcode is used for both. + if (MI.getOpcode() == TargetOpcode::G_ADD || + MI.getOpcode() == TargetOpcode::G_MUL) B.buildInstr(Opc, {DstReg}, {AddReg}); else B.buildSExt(DstReg, AddReg); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 08f547a..6257e99 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -523,7 +523,8 @@ void AArch64TargetELFStreamer::finish() { // mark it execute-only if it is empty and there is at least one // execute-only section in the object. if (any_of(Asm, [](const MCSection &Sec) { - return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_AARCH64_PURECODE; + return static_cast<const MCSectionELF &>(Sec).getFlags() & + ELF::SHF_AARCH64_PURECODE; })) { auto *Text = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 1ac340a..a22a17a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -132,7 +132,8 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section, // But only if they don't point to a few forbidden sections. 
if (!Symbol.isInSection()) return true; - const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection()); + const MCSectionMachO &RefSec = + static_cast<MCSectionMachO &>(Symbol.getSection()); if (RefSec.getType() == MachO::S_CSTRING_LITERALS) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 2a36f3d..8a0c4ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts", + "HasFmaMixBF16Insts", + "true", + "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions" +>; + def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts", "HasIEEEMinimumMaximumInsts", "true", @@ -262,12 +268,30 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", "S_INST_PREFETCH instruction causes shader to hang" >; +def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts", + "HasVmemPrefInsts", + "true", + "Has flat_prefect_b8 and global_prefetch_b8 instructions" +>; + def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch", "HasSafeSmemPrefetch", "true", "SMEM prefetches do not fail on illegal address" >; +def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", + "HasSafeCUPrefetch", + "true", + "VMEM CU scope prefetches do not fail on illegal address" +>; + +def FeatureCUStores : SubtargetFeature<"cu-stores", + "HasCUStores", + "true", + "Whether SCOPE_CU stores can be used on GFX12.5" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -1365,6 +1389,9 @@ def FeatureAddSubU64Insts : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", "Has v_add_u64 and v_sub_u64 instructions">; +def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", + "true", "Has v_mad_u32 instruction">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1970,6 +1997,7 @@ def FeatureISAVersion12 : FeatureSet< def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, + FeatureCUStores, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, @@ -2007,6 +2035,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureBF16ConversionInsts, FeatureBF16PackedInsts, FeatureCvtPkF16F32Inst, + FeatureFmaMixBF16Insts, FeatureMin3Max3PKF16, FeatureMinimum3Maximum3PKF16, FeaturePrngInst, @@ -2020,8 +2049,10 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, FeatureKernargPreload, + FeatureVmemPrefInsts, FeatureLshlAddU64Inst, FeatureAddSubU64Insts, + FeatureMadU32Inst, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2402,7 +2433,7 @@ def HasAtomicFMinFMaxF64FlatInsts : def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, - AssemblerPredicate<(any_of FeatureGFX90AInsts)>; + AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; @@ -2545,6 +2576,10 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; +def HasAddMinMaxInsts : + Predicate<"Subtarget->hasAddMinMaxInsts()">, + 
AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def HasPkAddMinMaxInsts : Predicate<"Subtarget->hasPkAddMinMaxInsts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; @@ -2599,6 +2634,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">, def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, AssemblerPredicate<(all_of FeatureFmaMixInsts)>; +def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, + AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; + def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<(all_of FeatureDLInsts)>; @@ -2797,6 +2835,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, AssemblerPredicate<(all_of FeatureXF32Insts)>; +def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, + AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; + def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, AssemblerPredicate<(all_of FeatureAshrPkInsts)>; @@ -2806,6 +2847,9 @@ def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; +def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, + AssemblerPredicate<(all_of FeatureMadU32Inst)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 4b3dc37..6681393 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (MF.getSubtarget<GCNSubtarget>().isWave32()) { + if (ST.isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } + if (isGFX1250(ST) && ST.hasCUStores()) { + KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; + } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. 
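
For context on the AMDGPUAsmPrinter hunk above, the stand-alone C++ sketch below mirrors the accumulate-and-OR pattern used to build the AMDHSA kernel-code-properties mask, including the new gfx1250 CU-stores gate. It is only an illustration under stated assumptions: the enum values, the SubtargetInfo struct, and computeKernelCodeProperties are hypothetical stand-ins, not the real amdhsa definitions or the actual getAmdhsaKernelCodeProperties interface.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins for a few kernel-code-property bits; the real bit
// positions are defined in the AMDHSA kernel descriptor headers and are not
// reproduced here.
enum KernelCodeProperty : uint16_t {
  ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = 1u << 0,
  ENABLE_WAVEFRONT_SIZE32 = 1u << 9,
  USES_CU_STORES = 1u << 10,
};

// Hypothetical summary of the subtarget queries that appear in the hunk.
struct SubtargetInfo {
  bool HasPrivateSegmentBuffer;
  bool IsWave32;
  bool IsGFX1250;
  bool HasCUStores;
};

// Each subtarget test ORs exactly one property bit into the mask; the
// CU-stores bit is set only for gfx1250 parts that support CU-scope stores,
// matching the shape of the patched code.
static uint16_t computeKernelCodeProperties(const SubtargetInfo &ST) {
  uint16_t Props = 0;
  if (ST.HasPrivateSegmentBuffer)
    Props |= ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  if (ST.IsWave32)
    Props |= ENABLE_WAVEFRONT_SIZE32;
  if (ST.IsGFX1250 && ST.HasCUStores)
    Props |= USES_CU_STORES;
  return Props;
}

int main() {
  SubtargetInfo ST{/*HasPrivateSegmentBuffer=*/true, /*IsWave32=*/true,
                   /*IsGFX1250=*/true, /*HasCUStores=*/true};
  std::printf("kernel code properties: 0x%04x\n",
              (unsigned)computeKernelCodeProperties(ST));
  return 0;
}
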
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index dedee46..59cc1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,7 +13,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" @@ -1383,7 +1382,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); + &AAIndirectCallInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 5f19837..a9278c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -static bool hasUnsafeFPMath(const Function &F) { - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); -} - class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -104,7 +100,6 @@ public: const DominatorTree *DT; const UniformityInfo &UA; const DataLayout &DL; - const bool HasUnsafeFPMath; const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; @@ -117,7 +112,6 @@ public: const DominatorTree *DT, const UniformityInfo &UA) : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC), DT(DT), UA(UA), DL(F.getDataLayout()), - HasUnsafeFPMath(hasUnsafeFPMath(F)), HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == DenormalMode::getPreserveSign()) {} @@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, return false; // v_rsq_f32 gives 1ulp - return SqrtFMF.approxFunc() || HasUnsafeFPMath || - SqrtOp->getFPAccuracy() >= 1.0f; + return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<>::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) @@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( // Optimize fdiv with rcp: // // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // // a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0 Value * @@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( // // With rcp: // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // -// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn. 
// // With fdiv.fast: // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. @@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { RsqOp = SqrtOp->getOperand(0); } - // Inaccurate rcp is allowed with unsafe-fp-math or afn. + // Inaccurate rcp is allowed with afn. // // Defer to codegen to handle this. // @@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { // expansion of afn to codegen. The current interpretation is so aggressive we // don't need any pre-consideration here when we have better information. A // more conservative interpretation could use handling here. - const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); + const bool AllowInaccurateRcp = DivFMF.approxFunc(); if (!RsqOp && AllowInaccurateRcp) return false; @@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { // We're trying to handle the fast-but-not-that-fast case only. The lowering // of fast llvm.sqrt will give the raw instruction anyway. - if (SqrtFMF.approxFunc() || HasUnsafeFPMath) + if (SqrtFMF.approxFunc()) return false; const float ReqdAccuracy = FPOp->getFPAccuracy(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 891d362..992572f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,9 +137,15 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; +def gi_global_saddr_cpol : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, + GIComplexPatternEquiv<GlobalSAddrCPol>; def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; +def gi_global_saddr_no_ioffset : + GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">, + GIComplexPatternEquiv<GlobalSAddrNoIOffset>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, @@ -446,5 +452,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, + GISDNodeXFormEquiv<PrefetchLoc>; + def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">, GISDNodeXFormEquiv<MFMALdScaleXForm>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5a2416de..39b4200 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -2020,6 +2031,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, @@ -2033,6 +2060,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); @@ -3861,58 +3906,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +// Match lowered fpext from bf16 to f32. This is a bit operation extending +// a 16-bit value with 16-bit of zeroes at LSB: +// +// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val))))) +// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true +// 3. 
(f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false +static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) { + if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + + IsExtractHigh = false; + if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) { + auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + if (!Low16 || !Low16->isZero()) + return SDValue(); + Op = stripBitcast(Op.getOperand(1)); + if (Op.getValueType() != MVT::bf16) + return SDValue(); + return Op; + } + + if (Op.getValueType() != MVT::i32) + return SDValue(); + + if (Op.getOpcode() == ISD::AND) { + if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Mask->getZExtValue() == 0xffff0000) { + IsExtractHigh = true; + return Op.getOperand(0); + } + } + return SDValue(); + } + + if (Op.getOpcode() == ISD::SHL) { + if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Amt->getZExtValue() == 16) + return Op.getOperand(0); + } + } + + return SDValue(); +} + // The return value is not whether the match is possible (which it always is), // but whether or not it a conversion is really used. bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + MVT VT) const { Mods = 0; SelectVOP3ModsImpl(In, Src, Mods); + bool IsExtractHigh = false; if (Src.getOpcode() == ISD::FP_EXTEND) { Src = Src.getOperand(0); - assert(Src.getValueType() == MVT::f16); - Src = stripBitcast(Src); + } else if (VT == MVT::bf16) { + SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh); + if (!B16) + return false; + Src = B16; + } else + return false; + + if (Src.getValueType() != VT && + (VT != MVT::bf16 || Src.getValueType() != MVT::i32)) + return false; - // Be careful about folding modifiers if we already have an abs. fneg is - // applied last, so we don't want to apply an earlier fneg. - if ((Mods & SISrcMods::ABS) == 0) { - unsigned ModsTmp; - SelectVOP3ModsImpl(Src, Src, ModsTmp); + Src = stripBitcast(Src); - if ((ModsTmp & SISrcMods::NEG) != 0) - Mods ^= SISrcMods::NEG; + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); - if ((ModsTmp & SISrcMods::ABS) != 0) - Mods |= SISrcMods::ABS; - } + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; - // op_sel/op_sel_hi decide the source type and source. - // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. - // If the sources's op_sel is set, it picks the high half of the source - // register. + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } - Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(Src, Src)) { - Mods |= SISrcMods::OP_SEL_0; + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the sources's op_sel is set, it picks the high half of the source + // register. - // TODO: Should we try to look for neg/abs here? 
- } + Mods |= SISrcMods::OP_SEL_1; + if (IsExtractHigh || + (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { + Mods |= SISrcMods::OP_SEL_0; - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } - return true; + // TODO: Should we try to look for neg/abs here? } - return false; + // Prevent unnecessary subreg COPY to VGPR_16 + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + } + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16)) return false; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; @@ -3921,7 +4022,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - SelectVOP3PMadMixModsImpl(In, Src, Mods); + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 6123d75..983f1aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -19,6 +19,7 @@ #include "SIModeRegisterDefaults.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -167,9 +168,14 @@ private: bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, @@ -254,11 +260,15 @@ private: bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + MVT VT) const; bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool 
SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e3ca09e..31c4f62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, - ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, - MVT::f32, Legal); + ISD::FROUNDEVEN, ISD::FTRUNC}, + {MVT::f16, MVT::f32}, Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); @@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else { + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + } else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } @@ -2632,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, if (Flags.hasApproximateFuncs()) return true; auto &Options = DAG.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, @@ -2755,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, const auto &Options = getTargetMachine().Options; if (VT == MVT::f16 || Flags.hasApproximateFuncs() || - Options.ApproxFuncFPMath || Options.UnsafeFPMath) { + Options.ApproxFuncFPMath) { if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { // Log and multiply in f32 is good enough for f16. @@ -3583,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con if (N0.getValueType() == MVT::f32) return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { // There is a generic expand for FP_TO_FP16 with unsafe fast math. return SDValue(); } @@ -4844,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. 
- if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). - if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 877c3ac..b0d3b12 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -574,13 +574,22 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; if (Subtarget->hasMADIntraFwdBug()) Opc = IsUnsigned ? 
AMDGPU::V_MAD_U64_U32_gfx11_e64 : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else if (UseNoCarry) + Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64 + : AMDGPU::V_MAD_NC_I64_I32_e64; else Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + + if (UseNoCarry) + I.removeOperand(1); + I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); @@ -3995,6 +4004,9 @@ bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { } unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; + if (!IsB32 && STI.hasTrue16BitInsts()) + Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64 + : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64; unsigned CBL = STI.getConstantBusLimit(Opc); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -5774,11 +5786,32 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset( + MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol, false); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); Register PtrBase; @@ -6961,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : (int64_t)SISrcMods::DST_OP_SEL); } @@ -6976,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0); + (MI.getOperand(OpIdx).getImm() & 0x2) ? 
(int64_t)SISrcMods::OP_SEL_0 : 0); } void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1) + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)(SISrcMods::OP_SEL_0) : 0); } @@ -7011,8 +7044,9 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0( void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm( - (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0); + MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2) + ? (int64_t)SISrcMods::DST_OP_SEL + : 0); } void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, @@ -7068,6 +7102,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + uint32_t V = MI.getOperand(2).getImm(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) + << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + MIB.addImm(V); +} + /// Convert from 2-bit value to enum values used for op_sel* source modifiers. void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 5f7f05c..140e753 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -261,7 +261,11 @@ private: InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddrCPol(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrNoIOffset(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; @@ -414,6 +418,10 @@ private: void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + + void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7a50923..511fc69 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode() def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">; def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">; -def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; } def FMA : Predicate<"Subtarget->hasFMA()">; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fedfa3f..1fdf272 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1342,13 +1342,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); if (ST.hasVOP3PInsts()) { - getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElements(0, S16, 2) - .minScalar(0, S16) - .widenScalarToNextPow2(0) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder(G_ABS) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + if (ST.hasIntMinMax64()) { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, S64, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } else { + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElements(0, S16, 2) + .minScalar(0, S16) + .widenScalarToNextPow2(0) + .scalarize(0) + .lower(); + } } else { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16}) @@ -1682,7 +1699,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasFlatAtomicFaddF32Inst()) Atomic.legalFor({{S32, FlatPtr}}); - if (ST.hasGFX90AInsts()) { + if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations // TODO: Move atomic expansion into legalizer @@ -2295,8 +2312,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::scalar(32), commonAlignment(Align(64), Offset)); // Pointer address - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -2317,8 +2334,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), StructOffset)); - B.buildPtrAdd(LoadAddr, QueuePtr, - B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); + B.buildObjectPtrOffset( + LoadAddr, QueuePtr, + B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -3326,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { if (Flags & MachineInstr::FmAfn) return true; const auto &Options = MF.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, @@ -3432,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || - TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { + TM.Options.ApproxFuncFPMath) { if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); auto PromoteSrc = B.buildFPExt(F32, X); @@ -4500,8 +4518,7 @@ Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, llvm_unreachable("failed to find kernarg segment ptr"); auto COffset = B.buildConstant(LLT::scalar(64), Offset); - // TODO: 
Should get nuw - return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); + return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0); } /// Legalize a value that's loaded from kernel arguments. This is only used by @@ -4860,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || - MF.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) { if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) @@ -4922,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || - MI.getFlag(MachineInstr::FmAfn); + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (!AllowInaccurateRcp) return false; @@ -5676,8 +5689,8 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; - // FIXME: This should be nuw - B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + B.buildObjectPtrOffset(DstReg, KernargPtrReg, + B.buildConstant(IdxTy, Offset).getReg(0)); return true; } @@ -7019,8 +7032,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( // Pointer address Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - B.buildPtrAdd(LoadAddr, KernargPtrReg, - B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + B.buildObjectPtrOffset(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); // Load address Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); B.buildCopy(SGPR01, Temp); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 8767208..aa75534 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -53,8 +53,6 @@ private: using FuncInfo = llvm::AMDGPULibFunc; - bool UnsafeFPMath = false; - // -fuse-native. 
bool AllNative = false; @@ -117,7 +115,6 @@ private: bool AllowStrictFP = false); protected: - bool isUnsafeMath(const FPMathOperator *FPOp) const; bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const; bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const; @@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, return AMDGPULibFunc::parse(FMangledName, FInfo); } -bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || FPOp->isFast(); -} - bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || - (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); + return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs(); } bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( const FPMathOperator *FPOp) const { // TODO: Refine to approxFunc or contract - return isUnsafeMath(FPOp); + return FPOp->isFast(); } void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { - UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool(); AC = &FAM.getResult<AssumptionAnalysis>(F); TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index f471881..b45627d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -294,7 +294,8 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, BasePlusOffset = Base; } else { auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); - BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); + BasePlusOffset = + B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0); } auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 9b05f7c..6bca2fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3501,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingMAD_64_32(B, OpdMapper); return; case AMDGPU::G_PREFETCH: { - if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) { + if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { MI.eraseFromParent(); return; } Register PtrReg = MI.getOperand(0).getReg(); unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); - if (PtrBank == AMDGPU::VGPRRegBankID) { + if (PtrBank == AMDGPU::VGPRRegBankID && + (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) { + // Cannot do I$ prefetch with divergent pointer. 
MI.eraseFromParent(); return; } unsigned AS = MRI.getType(PtrReg).getAddressSpace(); - if (!AMDGPU::isFlatGlobalAddrSpace(AS) && - AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || + (!Subtarget.hasSafeSmemPrefetch() && + (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + !MI.getOperand(3).getImm() /* I$ prefetch */))) { MI.eraseFromParent(); return; } @@ -4004,10 +4009,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_SMIN: - case AMDGPU::G_SMAX: - case AMDGPU::G_UMIN: - case AMDGPU::G_UMAX: case AMDGPU::G_ABS: case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: @@ -4017,6 +4018,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + case AMDGPU::G_SMIN: + case AMDGPU::G_SMAX: + case AMDGPU::G_UMIN: + case AMDGPU::G_UMAX: + if (isSALUMapping(MI)) { + // There are no scalar 64-bit min and max, use vector instruction instead. + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 64 && + Subtarget.hasIntMinMax64()) + return getDefaultMappingVOP(MI); + return getDefaultMappingSOP(MI); + } + return getDefaultMappingVOP(MI); case AMDGPU::G_FADD: case AMDGPU::G_FSUB: case AMDGPU::G_FMUL: @@ -4561,8 +4574,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pknorm_u16: case Intrinsic::amdgcn_cvt_pk_i16: case Intrinsic::amdgcn_cvt_pk_u16: + case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_cvt_pk_fp8_f16: + case Intrinsic::amdgcn_cvt_pk_bf8_f16: + case Intrinsic::amdgcn_cvt_sr_fp8_f16: + case Intrinsic::amdgcn_cvt_sr_bf8_f16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: @@ -5175,6 +5193,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -5353,6 +5377,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); @@ -5437,6 +5469,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), 
MRI, *TRI); break; } + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: + return getDefaultMappingVOP(MI); default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index a8e1967..f580f43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // If the inputs are tied and the same register, we can shortcut and // directly replace the register. - if (Src2->getReg() != CopySrcReg) { + if (!Src2->isReg() || Src2->getReg() != CopySrcReg || + Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { LLVM_DEBUG( dbgs() << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..c1f1703 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -104,7 +104,9 @@ #include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/InferAddressSpaces.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h" #include "llvm/Transforms/Scalar/Sink.h" @@ -836,8 +838,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +920,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. @@ -2062,7 +2068,12 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // TODO: May want to move later or split into an early and late one. addPass(AMDGPUCodeGenPreparePass(TM)); - // TODO: LICM + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. 
+ if (TM.getOptLevel() > CodeGenOptLevel::Less) { + addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true)); + } } Base::addIRPasses(addPass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 24f4df2..a0c99b0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( // Estimate all types may be fused with contract/unsafe flags const TargetOptions &Options = TLI->getTargetMachine().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || (FAdd->hasAllowContract() && CxtI->hasAllowContract())) return TargetTransformInfo::TCC_Free; } @@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( return LT.first * Cost * NElts; } - if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || - TLI->getTargetMachine().Options.UnsafeFPMath)) { + if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) { // Fast unsafe fdiv lowering: // f32 rcp // f32 fmul diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 421fc42..a4ea8cf 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -689,6 +689,8 @@ public: bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); } + bool isVISrcB32() const { return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); } @@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: @@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo setImmKindConst(); return; } + [[fallthrough]]; + + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: Inst.addOperand(MCOperand::createImm(Lo_32(Val))); setImmKindLiteral(); @@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16) return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm()); + if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16) + return false; + llvm_unreachable("invalid operand type"); } default: @@ -6066,6 +6076,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; + } else if (ID == ".amdhsa_uses_cu_stores") { + if (!isGFX1250()) + return Error(IDRange.Start, "directive requires gfx12.5", IDRange); + + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) @@ -9415,7 +9431,19 @@ void 
AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) { + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) { Inst.addOperand(Inst.getOperand(0)); } diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index f99e716..1956a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> { } //===----------------------------------------------------------------------===// -// MUBUF - GFX11, GFX12. +// MUBUF - GFX11, GFX12, GFX1250. //===----------------------------------------------------------------------===// // gfx11 instruction that accept both old and new assembler name. @@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op, def : Mnem_gfx12<gfx11_name, gfx12_name>; } +multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>, + MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> { + def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>; +} + defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>; defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>; @@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; +defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>; +defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">; +defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">; + //===----------------------------------------------------------------------===// // MUBUF - GFX10. 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 42edec0..c466f9c 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -199,6 +199,7 @@ add_llvm_target(AMDGPUCodeGen
   Instrumentation
   MC
   MIRParser
+  ObjCARC
   Passes
   Scalar
   SelectionDAG
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 319cc9d..3ff675d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
 defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
 defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;

+defm DS_ADD_F64 : DS_Real_gfx12<0x054>;
+defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>;
+
 let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
 defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
 defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b..ffe6b06 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
     PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
                     KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+    if (isGFX1250())
+      PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+                      KERNEL_CODE_PROPERTY_USES_CU_STORES);

     if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
       return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 679c55d..d5d1074 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,8 +11,10 @@ let WantsRoot = true in {
   def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
   def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
+  def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
   def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
   def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
+  def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
   def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
   def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
 }

@@ -368,31 +370,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
   }
 }

-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+// Async loads, introduced in gfx1250, will store directly
+// to a DS address in vdst (they will not use M0 for DS address).
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, [ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -464,6 +503,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : + FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { + let has_vdst = 0; + let has_data = 0; + let mayLoad = 1; + let mayStore = 1; + let VM_CNT = 0; + let LGKM_CNT = 0; +} + +multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { + def "" : FLAT_Prefetch_Pseudo<opName>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let enabled_saddr = 1; + } +} + +multiclass FLAT_Global_Prefetch_Pseudo<string opName> { + let is_flat_global = 1, has_saddr = 1 in { + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " 
$vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let enabled_saddr = 1; + } + } +} + class FlatScratchInst <string sv_op, string mode> { string SVOp = sv_op; string Mode = mode; @@ -1124,6 +1194,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -1162,6 +1241,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; +let SubtargetPredicate = isGFX125xOnly in { +defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>; +defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>; +defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>; + +defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>; +defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>; +defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; @@ -1218,6 +1307,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in "global_atomic_pk_add_f16", VGPR_32, v2f16 >; +let SubtargetPredicate = HasVmemPrefInsts in { + defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">; + defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">; +} + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -1228,6 +1322,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN (inst $vaddr, $offset) >; +class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1249,8 +1348,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, 0, $in) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1263,9 +1362,29 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $dsaddr, $vaddr, $offset, $cpol) +>; + +class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $dsaddr, $saddr, $voffset, $offset, $cpol) +>; + +class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (inst $vaddr, $dsaddr, $offset, $cpol) +>; + +class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (inst $saddr, $voffset, $dsaddr, $offset, $cpol) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, (i32 0)) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1278,6 +1397,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), @@ -1459,10 +1588,30 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; +multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatLoadLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : 
GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + +multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { + def : FlatStoreLDSSignedPat <inst, node> { + let AddedComplexity = 10; + } + + def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatLoadSignedPat <inst, node, vt> { let AddedComplexity = 10; @@ -1473,6 +1622,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp } } +multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadSignedPat_CPOL<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatSignedLoadPat_D16 <inst, node, vt> { let AddedComplexity = 10; @@ -2009,6 +2168,28 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; } +let OtherPredicates = [isGFX125xOnly] in { + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; +} // End SubtargetPredicate = isGFX125xOnly + +let OtherPredicates = [isGFX1250Plus] in { + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>; + defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>; + + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>; + defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -2138,6 +2319,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f } // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +def PrefetchLoc: SDNodeXForm<timm, [{ + uint32_t V = N->getZExtValue(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, 
(uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32); +}]>; + +def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; + }]; +} + +def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS || + ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); + }]; +} + +multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> { + def : GCNPat < + (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25); + } + + def : GCNPat < + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); + } +} + +multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { + def : GCNPat < + (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol) + >; + + def : GCNPat < + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { + let AddedComplexity = 11; + } +} + +let SubtargetPredicate = HasVmemPrefInsts in { + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>; + + // Patterns for forced vector prefetch with rw = 1. 
+ defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>; + + + // Patterns for target intrinsics + defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>; + defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>; +} // End SubtargetPredicate = HasVmemPrefInsts + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -3210,12 +3462,40 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>; defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>; +defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; +defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; + +defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; +defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; + +defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; +defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">; +defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">; + def True16D16Table : GenericTable { let FilterClass = "True16D16Table"; let CppTypeName = "True16D16Info"; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7d6723a..334afd3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI) { - return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR); + return STI->isSGPRClass(RC) + ? SGPR + : (STI->isAGPRClass(RC) + ? AGPR + : (STI->isVectorSuperClass(RC) ? 
AVGPR : VGPR)); } void GCNRegPressure::inc(unsigned Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 3749b6d..ea33a22 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -29,43 +29,57 @@ class raw_ostream; class SlotIndex; struct GCNRegPressure { - enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS }; + enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; GCNRegPressure() { clear(); } - bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; } + bool empty() const { + return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; + } void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p - /// UnifiedVGPRFile + /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure + /// dependent upon \p UnifiedVGPRFile unsigned getVGPRNum(bool UnifiedVGPRFile) const { if (UnifiedVGPRFile) { - return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR]) - : Value[VGPR]; + return Value[AGPR] + ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) + : Value[VGPR] + Value[AVGPR]; } - return std::max(Value[VGPR], Value[AGPR]); + // AVGPR assignment priority is based on the width of the register. Account + // AVGPR pressure as VGPR. + return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs - /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file. + /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified + /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, - unsigned NumAGPRs) { - return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + + unsigned NumAGPRs, + unsigned NumAVGPRs) { + + // Assume AVGPRs will be assigned as VGPRs. + return alignTo(NumArchVGPRs + NumAVGPRs, + AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs; } - /// \returns the ArchVGPR32 pressure - unsigned getArchVGPRNum() const { return Value[VGPR]; } + /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be + /// allocated as VGPR + unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } /// \returns the AccVGPR32 pressure unsigned getAGPRNum() const { return Value[AGPR]; } + /// \returns the AVGPR32 pressure + unsigned getAVGPRNum() const { return Value[AVGPR]; } unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]); + return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], + Value[TOTAL_KINDS + AGPR]); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index a655308..ce1ce68 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() { for (auto &[DefMI, Remat] : Rematerializations) { MachineBasicBlock::iterator InsertPos(Remat.UseMI); Register Reg = DefMI->getOperand(0).getReg(); - unsigned SubReg = DefMI->getOperand(0).getSubReg(); unsigned DefRegion = MIRegion.at(DefMI); // Rematerialize DefMI to its use block. 
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI, - *DAG.TRI); + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); Remat.RematMI = &*std::prev(InsertPos); - Remat.RematMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); // Update region boundaries in regions we sinked from (remove defining MI) @@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() { MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); MachineBasicBlock *MBB = RegionBB[DefRegion]; Register Reg = RematMI.getOperand(0).getReg(); - unsigned SubReg = RematMI.getOperand(0).getSubReg(); // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, + *DAG.TRI); MachineInstr *NewMI = &*std::prev(InsertPos); - NewMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*NewMI); auto UseRegion = MIRegion.find(Remat.UseMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 9a2bab1..0a0a107 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return getMaxNumVGPRs(MF.getFunction()); } +std::pair<unsigned, unsigned> +GCNSubtarget::getMaxNumVectorRegs(const Function &F) const { + const unsigned MaxVectorRegs = getMaxNumVGPRs(F); + + unsigned MaxNumVGPRs = MaxVectorRegs; + unsigned MaxNumAGPRs = 0; + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (hasGFX90AInsts()) { + unsigned MinNumAGPRs = 0; + const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); + const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; + + // TODO: The lower bound should probably force the number of required + // registers up, overriding amdgpu-waves-per-eu. + std::tie(MinNumAGPRs, MaxNumAGPRs) = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR, + /*OnlyFirstRequired=*/true); + + if (MinNumAGPRs == DefaultNumAGPR.first) { + // Default to splitting half the registers if AGPRs are required. + MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; + } else { + // Align to accum_offset's allocation granularity. + MinNumAGPRs = alignTo(MinNumAGPRs, 4); + + MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); + } + + // Clamp values to be inbounds of our limits, and ensure min <= max. 
+ + MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); + MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); + + MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); + MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); + + assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && + MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && + "invalid register counts"); + } else if (hasMAIInsts()) { + // On gfx908 the number of AGPRs always equals the number of VGPRs. + MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; + } + + return std::pair(MaxNumVGPRs, MaxNumAGPRs); +} + void GCNSubtarget::adjustSchedDependency( SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0435e7f..bdd900d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -123,6 +123,7 @@ protected: bool HasSMemRealTime = false; bool HasIntClamp = false; bool HasFmaMixInsts = false; + bool HasFmaMixBF16Insts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; bool HasScalarDwordx3Loads = false; @@ -244,7 +245,10 @@ protected: bool HasVMEMtoScalarWriteHazard = false; bool HasSMEMtoVectorWriteHazard = false; bool HasInstFwdPrefetchBug = false; + bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; + bool HasSafeCUPrefetch = false; + bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -269,6 +273,7 @@ protected: bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; bool HasAddSubU64Insts = false; + bool HasMadU32Inst = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -462,6 +467,8 @@ public: return HasFmaMixInsts; } + bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasCARRY() const { return true; } @@ -709,7 +716,9 @@ public: bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } // DS_ADD_F64/DS_ADD_RTN_F64 - bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); } + bool hasLdsAtomicAddF64() const { + return hasGFX90AInsts() || hasGFX1250Insts(); + } bool hasMultiDwordFlatScratchAddressing() const { return getGeneration() >= GFX9; @@ -987,8 +996,14 @@ public: bool hasPrefetch() const { return GFX12Insts; } + bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } + bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } + bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + + bool hasCUStores() const { return HasCUStores; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1507,9 +1522,22 @@ public: // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + // \returns true if the target has V_MAD_U32 instruction. + bool hasMadU32Inst() const { return HasMadU32Inst; } + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 + // instructions. + bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + + // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. 
+ bool hasIntMinMax64() const { return GFX1250Insts; } + + // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. + bool hasAddMinMaxInsts() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } @@ -1658,6 +1686,10 @@ public: return getMaxNumVGPRs(F); } + /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number + /// of waves per execution unit required for the function \p MF. + std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const; + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 11b072e..15088ac 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; default: llvm_unreachable("bad operand type"); } @@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index c49ad79..f358084 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return 255; + case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: case AMDGPU::OPERAND_KIMM64: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 10f6d33..43ca548 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); + if (isGFX1250(STI)) + PrintField(KD.kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, + amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, + ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 3902d4c..c564145 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -208,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_V2BF16, 
OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, OPERAND_REG_IMM_V2FP32, @@ -392,11 +393,13 @@ enum CPol { TH_ATOMIC_CASCADE = 4, // Cascading vs regular // Scope - SCOPE = 0x3 << 3, // All Scope bits - SCOPE_CU = 0 << 3, - SCOPE_SE = 1 << 3, - SCOPE_DEV = 2 << 3, - SCOPE_SYS = 3 << 3, + SCOPE_SHIFT = 3, + SCOPE_MASK = 0x3, + SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits + SCOPE_CU = 0 << SCOPE_SHIFT, + SCOPE_SE = 1 << SCOPE_SHIFT, + SCOPE_DEV = 2 << SCOPE_SHIFT, + SCOPE_SYS = 3 << SCOPE_SHIFT, NV = 1 << 5, // Non-volatile bit diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e5d1eaa..e934152 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: @@ -1062,9 +1063,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( switch (OpTy) { case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0); break; case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1); break; default: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f1a8ee1..ad26757 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -882,7 +882,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch()) + if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); if (Subtarget->hasIEEEMinimumMaximumInsts()) { @@ -909,6 +909,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); } + if (Subtarget->hasIntMinMax64()) + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64, + Legal); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, @@ -1061,10 +1065,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? 
denormalModeIsFlushAllF32(DAG.getMachineFunction()); } @@ -1254,6 +1260,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } +static unsigned getIntrMemWidth(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + return 8; + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + return 32; + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + return 64; + case Intrinsic::amdgcn_global_load_async_to_lds_b128: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: + return 128; + default: + llvm_unreachable("Unknown width"); + } +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1475,6 +1500,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1519,6 +1550,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; return true; } + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(1); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getArgOperand(0); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: { Info.opc = ISD::INTRINSIC_VOID; @@ -1548,7 +1599,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); @@ -1599,18 +1652,32 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: case 
Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_store_async_from_lds_b8: + case Intrinsic::amdgcn_global_store_async_from_lds_b32: + case Intrinsic::amdgcn_global_store_async_from_lds_b64: + case Intrinsic::amdgcn_global_store_async_from_lds_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_to_lds_b8: + case Intrinsic::amdgcn_global_load_async_to_lds_b32: + case Intrinsic::amdgcn_global_load_async_to_lds_b64: + case Intrinsic::amdgcn_global_load_async_to_lds_b128: Ptr = II->getArgOperand(1); break; default: @@ -4225,7 +4292,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = BaseAddr.getValue(1); Align StackAlign = TFL->getStackAlign(); if (Alignment > StackAlign) { - uint64_t ScaledAlignment = (uint64_t)Alignment.value() + uint64_t ScaledAlignment = Alignment.value() << Subtarget->getWavefrontSizeLog2(); uint64_t StackAlignMask = ScaledAlignment - 1; SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, @@ -4440,19 +4507,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, } SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - if (Op->isDivergent()) + if (Op->isDivergent() && + (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4))) + // Cannot do I$ prefetch with divergent pointer. return SDValue(); switch (cast<MemSDNode>(Op)->getAddressSpace()) { case AMDGPUAS::FLAT_ADDRESS: case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS_32BIT: break; + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + if (Subtarget->hasSafeSmemPrefetch()) + break; + [[fallthrough]]; default: return SDValue(); } + // I$ prefetch + if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4)) + return SDValue(); + return Op; } @@ -7123,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { SDValue Flags = Op.getOperand(1); SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags); return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags); @@ -11218,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { // Without !fpmath accuracy information, we can't do more because we don't @@ -11238,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // 1.0 / sqrt(x) -> rsq(x) - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // XXX - Is afn sufficient to do this for f64? 
The maximum ULP // error seems really high at 2^29 ULP. // 1.0 / x -> rcp(x) return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); @@ -11272,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateDiv = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateDiv = Flags.hasApproximateFuncs(); if (!AllowInaccurateDiv) return SDValue(); @@ -14154,6 +14228,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) || + (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) || (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) @@ -14523,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || (N0->getFlags().hasAllowContract() && N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { @@ -15646,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, // regardless of the denorm mode setting. Therefore, - // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. + // fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; - if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if (Options.AllowFPOpFusion == FPOpFusion::Fast || (N->getFlags().hasAllowContract() && FMA->getFlags().hasAllowContract())) { Op1 = Op1.getOperand(0); @@ -15869,6 +15945,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? 
ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15917,6 +16065,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9faf497..4b48fc4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,7 +552,7 @@ public: (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { // FLAT and SCRATCH instructions may access scratch. Other VMEM // instructions do not. 
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratchThroughFlat(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -565,7 +565,6 @@ public: bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -1381,6 +1380,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Modified = true; } else WaitcntInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + assert(ST->hasVMemToLDSLoad()); + LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II + << "Before: " << Wait.LoadCnt << '\n';); + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + + // It is possible (but unlikely) that this is the only wait instruction, + // in which case, we exit this loop without a WaitcntInstr to consume + // `Wait`. But that works because `Wait` was passed in by reference, and + // the callee eventually calls createNewWaitcnt on it. We test this + // possibility in an artificial MIR test since such a situation cannot be + // recreated by running the memory legalizer. + II.eraseFromParent(); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1552,6 +1565,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { + // Architectures higher than GFX10 do not have direct loads to + // LDS, so no work required here yet. + II.eraseFromParent(); + continue; } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2108,8 +2126,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { assert(TII->isFLAT(MI)); - // All flat instructions use the VMEM counter. - assert(TII->usesVM_CNT(MI)); + // All flat instructions use the VMEM counter except prefetch. + if (!TII->usesVM_CNT(MI)) + return false; // If there are no memory operands then conservatively assume the flat // operation may access VMEM. @@ -2159,32 +2178,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -// This is a flat memory operation. Check to see if it has memory tokens for -// either scratch or FLAT. -bool SIInsertWaitcnts::mayAccessScratchThroughFlat( - const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // SCRATCH instructions always access scratch. - if (TII->isFLATScratch(MI)) - return true; - - // GLOBAL instructions never access scratch. - if (TII->isFLATGlobal(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access scratch. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { - unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; - }); -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); @@ -2295,9 +2288,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } - // A Flat memory operation must access at least one address space. - assert(FlatASCount); - // This is a flat memory operation that access both VMEM and LDS, so note it // - it will require that both the VM and LGKM be flushed to zero if it is // pending when a VM or LGKM dependency occurs. @@ -2444,6 +2434,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + Opcode == AMDGPU::S_WAITCNT_lds_direct || counterTypeForInstr(Opcode).has_value(); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 40e6871..044a681 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(DstHi); } break; + + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + assert(ST.hasBF16PackedInsts()); + MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); + MI.addOperand(MachineOperand::CreateImm(0)); // op_sel + MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo + MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi + auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1); + auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1); + break; } + return true; } @@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI, } bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, - const MachineOperand *MO0, unsigned OpIdx1, - const MachineOperand *MO1) const { + unsigned OpIdx1) const { const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0]; const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1]; - const TargetRegisterClass *DefinedRC1 = - OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr; - const TargetRegisterClass *DefinedRC0 = - OpInfo1.RegClass != -1 ? 
RI.getRegClass(OpInfo0.RegClass) : nullptr; unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + const MachineOperand &MO0 = MI.getOperand(OpIdx0); + const MachineOperand &MO1 = MI.getOperand(OpIdx1); + // Swap doesn't breach constant bus or literal limits // It may move literal to position other than src0, this is not allowed // pre-gfx10 However, most test cases need literals in Src0 for VOP // FIXME: After gfx9, literal can be in place other than Src0 if (isVALU(MI)) { - if ((int)OpIdx0 == Src0Idx && !MO0->isReg() && - !isInlineConstant(*MO0, OpInfo1)) + if ((int)OpIdx0 == Src0Idx && !MO0.isReg() && + !isInlineConstant(MO0, OpInfo1)) return false; - if ((int)OpIdx1 == Src0Idx && !MO1->isReg() && - !isInlineConstant(*MO1, OpInfo0)) + if ((int)OpIdx1 == Src0Idx && !MO1.isReg() && + !isInlineConstant(MO1, OpInfo0)) return false; } - if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { - if (!DefinedRC1) + if ((int)OpIdx1 != Src0Idx && MO0.isReg()) { + if (OpInfo1.RegClass == -1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0) && - (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); + return isLegalRegOperand(MI, OpIdx1, MO0) && + (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1)); } - if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { - if (!DefinedRC0) + if ((int)OpIdx0 != Src0Idx && MO1.isReg()) { + if (OpInfo0.RegClass == -1) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && - isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) && + isLegalRegOperand(MI, OpIdx0, MO1); } // No need to check 64-bit literals since swapping does not bring new // 64-bit literals into current instruction to fold to 32-bit - return isImmOperandLegal(MI, OpIdx1, *MO0); + return isImmOperandLegal(MI, OpIdx1, MO0); } MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, static_cast<int>(Src1Idx) && "inconsistency with findCommutedOpIndices"); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - MachineOperand &Src1 = MI.getOperand(Src1Idx); - if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) { + if (!isLegalToSwap(MI, Src0Idx, Src1Idx)) return nullptr; - } + MachineInstr *CommutedMI = nullptr; + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src0.isReg() && Src1.isReg()) { // Be sure to copy the source modifiers to the right place. CommutedMI = @@ -4238,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } +bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { + if (!isFLAT(MI) || isFLATGlobal(MI)) + return false; + + // If scratch is not initialized, we can never access it. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + + // SCRATCH instructions always access scratch. + if (isFLATScratch(MI)) + return true; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // TODO (?): Does this need to be taught how to read noalias.addrspace ? + + // See if any memory operand specifies an address space that involves scratch. 
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an @@ -4401,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { @@ -9244,6 +9283,16 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { default: if (MI.isMetaInstruction()) return 0; + + // If D16 Pseudo inst, get correct MC code size + const auto *D16Info = AMDGPU::getT16D16Helper(Opc); + if (D16Info) { + // Assume d16_lo/hi inst are always in same size + unsigned LoInstOpcode = D16Info->LoOp; + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode); + DescSize = Desc.getSize(); + } + return DescSize; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 800ea9a..e042b59 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -197,8 +197,7 @@ protected: AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const; bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, - const MachineOperand *fromMO, unsigned toIdx, - const MachineOperand *toMO) const; + unsigned toIdx) const; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; @@ -679,6 +678,12 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with + /// SCRATCH_ memory operands. + /// Conservatively correct; will return true if \p MI cannot be proven + /// to not hit scratch. 
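+  /// Returns false when the function carries the "amdgpu-no-flat-scratch-init" attribute, since scratch is never initialized and can therefore never be accessed in that case.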
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 485ca78..efcc88e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">; +def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">; def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; @@ -2857,15 +2859,18 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>; +def VOP_I16_V2F16 : VOPProfile<[i16, v2f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>; +def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; @@ -2917,10 +2922,12 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp= def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; +def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>; +def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d05be8f..54fa192 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in def : ClampPat<V_MAX_F16_t16_e64, f16>; let SubtargetPredicate = UseFakeTrue16Insts in def : ClampPat<V_MAX_F16_fake16_e64, f16>; +// FIXME-TRUE16: Pseudo expansion of this won't work with True16. 
+let True16Predicate = UseFakeTrue16Insts in +def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1903,6 +1906,13 @@ def : GCNPat < >; } +let SubtargetPredicate = HasBF16PackedInsts in { +def : GCNPat < + (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))), + (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0, + $src0_modifiers, $src0, DSTCLAMP.ENABLE) +>; +} // End SubtargetPredicate = HasBF16PackedInsts /********** ================================ **********/ /********** Floating point absolute/negative **********/ diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9f61bf8..9509199 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, MachineRegisterInfo &MRI = MF.getRegInfo(); BitVector ReservedRegs = TRI->getReservedRegs(MF); BitVector NonWwmAllocMask(TRI->getNumRegs()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future // to have a balanced allocation between WWM values and per-thread vector @@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, NumRegs = std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs); - auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); // Try to use the highest available registers for now. Later after // vgpr-regalloc, they can be shifted to the lowest range. unsigned I = 0; @@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF, // Reserve an arbitrary register and report the error. TRI->markSuperRegs(RegMask, AMDGPU::VGPR0); MF.getFunction().getContext().emitError( - "can't find enough VGPRs for wwm-regalloc"); + "cannot find enough VGPRs for wwm-regalloc"); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index f0be204..9a1448f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm; - if (!MFMAVGPRForm && ST.hasGFX90AInsts() && - ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && - !mayUseAGPRs(F)) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + MayNeedAGPRs = ST.hasMAIInsts(); + if (ST.hasGFX90AInsts()) { + // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection + // should be separated from availability of AGPRs + if (MFMAVGPRForm || + (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && + !mayUseAGPRs(F))) + MayNeedAGPRs = false; // We will select all MAI with VGPR operands. 
+ } if (AMDGPU::isChainCC(CC)) { // Chain functions don't receive an SP from their caller, but are free to diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0e8a420..53f554e 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,7 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -602,7 +602,7 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (Pos == Position::AFTER) --MI; @@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, Changed = true; } + // On architectures that support direct loads to LDS, emit an unknown waitcnt + // at workgroup-scoped release operations that specify the LDS address space. + // SIInsertWaitcnts will later replace this with a vmcnt(). + if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) && + Scope == SIAtomicScope::WORKGROUP && + (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct)); + Changed = true; + } + if (VSCnt) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) @@ -2536,9 +2556,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2551,11 +2568,26 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::expandSystemScopeStore( - MachineBasicBlock::iterator &MI) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); - if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) - return insertWaitsBeforeSystemScopeStore(MI); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) + return false; + + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + + // GFX12.0 only: Extra waits needed before system scope stores. 
+ if (!ST.hasGFX1250Insts()) { + if (!Atomic && Scope == CPol::SCOPE_SYS) + return insertWaitsBeforeSystemScopeStore(MI); + return false; + } + + // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address + // space. + // We also require SCOPE_SE minimum if we do not have the "cu-stores" feature. + if (Scope == CPol::SCOPE_CU && + (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) + return setScope(MI, CPol::SCOPE_SE); return false; } @@ -2658,6 +2690,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2674,6 +2708,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2686,7 +2721,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->expandSystemScopeStore(MI); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 84cfa87..f3acc5c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } -std::pair<unsigned, unsigned> -SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { - const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF); - - unsigned MaxNumVGPRs = MaxVectorRegs; - unsigned MaxNumAGPRs = 0; - - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, - // a wave may have up to 512 total vector registers combining together both - // VGPRs and AGPRs. Hence, in an entry function without calls and without - // AGPRs used within it, it is possible to use the whole vector register - // budget for VGPRs. - // - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split - // register file accordingly. - if (ST.hasGFX90AInsts()) { - unsigned MinNumAGPRs = 0; - const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs(); - const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u}; - - // TODO: Move this logic into subtarget on IR function - // - // TODO: The lower bound should probably force the number of required - // registers up, overriding amdgpu-waves-per-eu. - std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute( - MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR, - /*OnlyFirstRequired=*/true); - - if (MinNumAGPRs == DefaultNumAGPR.first) { - // Default to splitting half the registers if AGPRs are required. - MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; - } else { - // Align to accum_offset's allocation granularity. - MinNumAGPRs = alignTo(MinNumAGPRs, 4); - - MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs); - } - - // Clamp values to be inbounds of our limits, and ensure min <= max.
- - MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs); - MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs); - - MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs); - MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs); - - assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs && - MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs && - "invalid register counts"); - } else if (ST.hasMAIInsts()) { - // On gfx908 the number of AGPRs always equals the number of VGPRs. - MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs; - } - - return std::pair(MaxNumVGPRs, MaxNumAGPRs); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); @@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve VGPRs/AGPRs. // - auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF); + auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction()); for (const TargetRegisterClass *RC : regclasses()) { if (RC->isBaseClass() && isVGPRClass(RC)) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 0008e5f..5508f07 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -90,11 +90,6 @@ public: /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number - /// of waves per execution unit required for the function \p MF. - std::pair<unsigned, unsigned> - getMaxNumVectorRegs(const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 0039d2f..36d1a3b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> let TSFlags{2} = HasVGPR; let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; + + // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) + // to decide which registers to try to assign first. Usually, this RegisterClass priority is given + // very high priority, if not the highest priority, when considering which VirtReg to allocate next. + // + // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to + // assign more constrained RegisterClasses first. As a result, we prioritize register classes with + // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32). + // + // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs. + // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained + // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the + // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor + // is used for scaling of the bit (i.e. 1 << 4). 
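+  // The resulting AllocationPriority is therefore (size-based priority) + BaseClassPriority * BaseClassScaleFactor.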
+ field int BaseClassPriority = 1; + field int BaseClassScaleFactor = 16; + } multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, @@ -575,7 +592,7 @@ let HasVGPR = 1 in { def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 255), (sequence "VGPR%u_HI16", 0, 255)))> { - let AllocationPriority = 2; + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; @@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types // Identical to VGPR_32 except it only contains the low 128 (Lo128) registers. def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 127))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = !sub(numRegs, 1); + + // Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the + // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result + // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for + // registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one + // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, + // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. + defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); + + let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor)); let Weight = numRegs; } // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1 in { + let HasVGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { // Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; + let BaseClassPriority = 0; let Size = 32; } } // End GeneratePressureSet = 0 @@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 // aligned base register. multiclass AVRegClass<int numRegs, list<ValueType> regTypes, dag vregList, dag aregList> { - let HasVGPR = 1, HasAGPR = 1 in { + let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; @@ -1191,6 +1218,8 @@ def VSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_FP64"> { def VSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2INT32">; def VSrc_v2f32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_IMM_V2FP32">; +def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; + //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 38cc51b..4bda51d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>; def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), (prefetch node:$ptr, node:$rw, node:$loc, node:$type), - [{ return !N->getOperand(1)->isDivergent();}]> { + [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> { let GISelPredicateCode = [{ - return isInstrUniform(MI); + return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch(); }]; } @@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> { } defm : SMPrefetchPat<"INST", i32imm_zero>; +let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case. defm : SMPrefetchPat<"DATA", i32imm_one>; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e103ccc..8303410 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +// Represents the point at which a wave must wait for all outstanding direct loads to LDS. +// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. + +def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { + let hasSideEffects = 0; +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..5827f18 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. 
+ return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { @@ -2652,6 +2659,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: @@ -3016,6 +3024,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return isInlinableLiteralV2BF16(Literal); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; default: llvm_unreachable("bad packed operand type"); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index c09a9d6..74d59f4 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1636,6 +1636,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: return 2; default: diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 550ec9d..9de7d6d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1344,6 +1344,8 @@ def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; } // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] let SubtargetPredicate = HasPkFmacF16Inst in { +// FIXME: V_PK_FMAC_F16 is currently not used in instruction selection. +// If this changes, ensure the DPP variant is not used for GFX11+. 
defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; } // End SubtargetPredicate = HasPkFmacF16Inst @@ -1904,7 +1906,7 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName, VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> : - VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>; + VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>; multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> : VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index b6f9568..2d3caec 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,9 +32,10 @@ class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let HasExtDPP = 0; } -let HasExt64BitDPP = 1 in { -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; +def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -44,6 +45,10 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +let HasExt64BitDPP = 1 in { +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; + class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { let HasExtVOP3DPP = 0; let HasExtDPP = 0; @@ -52,10 +57,13 @@ class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { def V_LSHL_ADD_U64_PROF : VOP3_Profile<VOP_I64_I64_I32_I64>; def VOP_F64_F64_F64_F64_DPP_PROF : VOP3_Profile<VOP_F64_F64_F64_F64>; - -def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { +def V_MAD_U32_PROF: VOP3_Profile<VOP_I32_I32_I32_I32> { let HasExtVOP3DPP = 0; - let HasExtDPP = 0; + let HasExt64BitDPP = 1; +} +def VOP_I64_I64_I64_DPP : VOP3_Profile<VOP_I64_I64_I64>; +def VOP_I32_I32_I64_DPP : VOP3_Profile<VOPProfile<[i64, i32, i32, i64]>> { + let HasClamp = 1; } } // End HasExt64BitDPP = 1; @@ -152,6 +160,15 @@ defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32 defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SchedRW = [WriteIntMul] in { + let SubtargetPredicate = HasMadU32Inst in + defm V_MAD_U32 : VOP3Inst <"v_mad_u32", V_MAD_U32_PROF>; + let SubtargetPredicate = isGFX1250Plus in { + defm V_MAD_NC_U64_U32 : VOP3Inst<"v_mad_nc_u64_u32", VOP_I32_I32_I64_DPP>; + defm V_MAD_NC_I64_I32 : VOP3Inst<"v_mad_nc_i64_i32", VOP_I32_I32_I64_DPP>; + } +} + let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; @@ -185,6 +202,13 @@ defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, f } // End SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { +defm V_MAX_I64 : VOP3Inst <"v_max_i64", VOP_I64_I64_I64_DPP, smax>; +defm V_MAX_U64 : VOP3Inst <"v_max_u64", VOP_I64_I64_I64_DPP, umax>; +defm V_MIN_I64 : VOP3Inst <"v_min_i64", VOP_I64_I64_I64_DPP, smin>; +defm 
V_MIN_U64 : VOP3Inst <"v_min_u64", VOP_I64_I64_I64_DPP, umin>; +} // End SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] + } // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { @@ -722,6 +746,13 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { defm V_MAXIMUM3_F16 : VOP3Inst_t16 <"v_maximum3_f16", VOP_F16_F16_F16_F16, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 +let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; +} + defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; @@ -848,6 +879,9 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +let SubtargetPredicate = HasMadU32Inst, AddedComplexity = 10 in + def : ThreeOp_i32_Pats<mul, add, V_MAD_U32_e64>; + def : GCNPat< (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; @@ -858,6 +892,13 @@ def : GCNPat< (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +let SubtargetPredicate = HasAddMinMaxInsts in { +def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +} + def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>; @@ -972,10 +1013,12 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim unsigned Val = N->getZExtValue(); unsigned New = 0; if (}] # modifier_idx # [{ == 0) { - New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) - : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); - } else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) { - New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL) + : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE); + } else if (}] # modifier_idx # [{== 1) { + New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + } if (}] # modifier_idx # [{== 2) { + New = (Val & 0x1) ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE; } return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; @@ -1427,34 +1470,72 @@ let SubtargetPredicate = isGFX12Plus in { } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = HasBitOp3Insts in { +let HasClamp = 0, HasModifiers = 1 in { +def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>; +def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>; +def BitOp3_B16_fake16_Profile : VOP3_Profile_Fake16<BitOp3_B16_Profile>; +} + +let OtherPredicates = [HasBitOp3Insts] in { let isReMaterializable = 1 in { - defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", - VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>>; + let SubtargetPredicate = isGFX940Plus in + defm V_BITOP3_B16 : VOP3Inst <"v_bitop3_b16", BitOp3_B16_Profile>; + let SubtargetPredicate = isGFX1250Plus in + defm V_BITOP3_B16_gfx1250 : VOP3Inst_t16_with_profiles <"v_bitop3_b16_gfx1250", BitOp3_B16_Profile, + BitOp3_B16_t16_Profile, BitOp3_B16_fake16_Profile>; defm V_BITOP3_B32 : VOP3Inst <"v_bitop3_b32", VOP3_BITOP3_Profile<VOPProfile <[i32, i32, i32, i32, i32]>, VOP3_REGULAR>>, VOPD_Component<0x12, "v_bitop2_b32">; } + def : GCNPat< (i32 (int_amdgcn_bitop3 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; def : GCNPat< - (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; - - def : GCNPat< (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i32:$bitop3)), (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3)) >; - def : GCNPat< - (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), - (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) - >; -} // End SubtargetPredicate = HasBitOp3Insts + let SubtargetPredicate = isGFX940Plus in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } // End SubtargetPredicate = isGFX940Plus + + let SubtargetPredicate = isGFX1250Plus in { + let True16Predicate = UseFakeTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_fake16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0)) + >; + } + let True16Predicate = UseRealTrue16Insts in { + def : GCNPat< + (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + + def : GCNPat< + (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i32:$bitop3)), + (i16 (V_BITOP3_B16_gfx1250_t16_e64 0, VSrcT_b16:$src0, 0, VSrcT_b16:$src1, 0, VSrcT_b16:$src2, timm:$bitop3, 0)) + >; + } + } // End SubtargetPredicate = isGFX1250Plus + +} // End OtherPredicates = [HasBitOp3Insts] class 
DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), @@ -1531,6 +1612,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; let SubtargetPredicate = HasBF16ConversionInsts in { let ReadsModeReg = 0 in { defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>; + defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>; } def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)), (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>; @@ -1541,6 +1623,53 @@ let SubtargetPredicate = HasBF16ConversionInsts in { (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>; } +let Src0RC64 = VSrc_NoInline_v2f16 in { +def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; +def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; +def VOP3_CVT_PK_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_PK_F8_F16_Profile>; +} + +let ReadsModeReg = 0, IsPacked = 0, SubtargetPredicate = isGFX125xOnly in { + defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_fp8_f16>; + defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f16_gfx1250", + VOP3_CVT_PK_F8_F16_Profile, + VOP3_CVT_PK_F8_F16_True16_Profile, + VOP3_CVT_PK_F8_F16_Fake16_Profile, + int_amdgcn_cvt_pk_bf8_f16>; +} + +let HasClamp = 0, HasOpSel = 1 in { +def VOP3_CVT_SR_F8_F16_Profile : VOP3_CVT_SR_F8_ByteSel_Profile<f16>; +def VOP3_CVT_SR_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_SR_F8_F16_Profile>; +def VOP3_CVT_SR_F8_F16_Fake16_Profile : VOP3_Profile_Fake16<VOP3_CVT_SR_F8_F16_Profile>; +} + +let SubtargetPredicate = isGFX1250Plus in { + let ReadsModeReg = 0 in { + // These instructions have non-standard use of op_sel. They are using bits 2 and 3 of opsel + // to select a byte in the vdst. Bits 0 and 1 are unused. 
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + defm V_CVT_SR_FP8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_fp8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile, + VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; + } + } // End ReadsModeReg = 0 + + let True16Predicate = UseRealTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>; + } + let True16Predicate = UseFakeTrue16Insts in { + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_fake16_e64, f16>; + def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_fake16_e64, f16>; + } +} // End SubtargetPredicate = isGFX1250Plus + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -1746,6 +1875,21 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_BITOP3_B16_gfx1250 : VOP3_Real_BITOP3_t16_and_fake16_gfx1250<0x233, "v_bitop3_b16">; +defm V_BITOP3_B32 : VOP3_Real_BITOP3_gfx1250<0x234>; + +defm V_MAD_U32 : VOP3Only_Realtriple_gfx1250<0x235>; +defm V_MAD_NC_U64_U32 : VOP3Only_Realtriple_gfx1250<0x2fa>; +defm V_MAD_NC_I64_I32 : VOP3Only_Realtriple_gfx1250<0x2fb>; +defm V_MIN_U64 : VOP3Only_Realtriple_gfx1250<0x318>; +defm V_MAX_U64 : VOP3Only_Realtriple_gfx1250<0x319>; +defm V_MIN_I64 : VOP3Only_Realtriple_gfx1250<0x31a>; +defm V_MAX_I64 : VOP3Only_Realtriple_gfx1250<0x31b>; +defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>; +defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>; +defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>; +defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>; + defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">; defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">; defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >; @@ -1918,7 +2062,14 @@ let AssemblerPredicate = isGFX11Plus in { // These instructions differ from GFX12 variant by supporting DPP: defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; +defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; +defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; +defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; +defm V_CVT_PK_FP8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x372, "v_cvt_pk_fp8_f16">; +defm V_CVT_PK_BF8_F16_gfx1250 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x373, "v_cvt_pk_bf8_f16">; +defm V_CVT_SR_FP8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x374>; +defm V_CVT_SR_BF8_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx1250<0x375>; //===----------------------------------------------------------------------===// // GFX10. 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index ea14c77..95fcd4a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> { bit UseTiedOutput = useTiedOutput; + defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret; + defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret; + defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret; + dag srcs = - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + (ins FP16InputMods:$src0_modifiers, Src0RC:$src0, + FP16InputMods:$src1_modifiers, Src1RC:$src1, + FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + FP16InputMods:$src2_modifiers, Src2RC:$src2); // FIXME: Clamp0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -161,38 +165,42 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_ multiclass MadFmaMixPats<SDPatternOperator fma_like, Instruction mix_inst, Instruction mixlo_inst, - Instruction mixhi_inst> { + Instruction mixhi_inst, + ValueType VT = f16, + ValueType vecVT = v2f16> { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? 
def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < (AMDGPUclamp (build_vector - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), - (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0, $hi_src1_modifiers, $hi_src1, $hi_src2_modifiers, $hi_src2, DSTCLAMP.ENABLE, @@ -204,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), @@ -214,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), DSTCLAMP.NONE, @@ -224,9 +232,9 @@ 
multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, @@ -241,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = p in { def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, @@ -253,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, @@ -268,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1), - (v2f16 (mixlo_inst $src0_modifiers, $src0, + (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1), + (vecVT (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16))) + (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16))) >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, 
$elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; } // end True16Predicate } @@ -360,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; } +let SubtargetPredicate = HasFmaMixBF16Insts in { +let isCommutable = 1 in { + +let isReMaterializable = 1 in +defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>; + +let FPDPRounding = 1 in { +defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; + +let ClampLo = 0, ClampHi = 1 in { +defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; +} +} // End FPDPRounding = 1 +} // End isCommutable = 1 + +defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>; +} // End SubtargetPredicate = HasFmaMixBF16Insts + def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { let HasModifiers = 0; } @@ -1210,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>; defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>; defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>; + + // Scalar pseudo used to emulate AMDGPUClamp. + // Expanded to V_PK_MAX_NUM_BF16 with unused high half. + // FIXME-TRUE16: Pseudo expansion of this won't work with True16. 
+ let True16Predicate = UseFakeTrue16Insts in + defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>; } } // End isCommutable = 1, isReMaterializable = 1 @@ -2247,6 +2279,13 @@ defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>; +defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; +defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; +defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; + +let AssemblerPredicate = isGFX1250Plus in +def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; + defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index c21e2d3..a029376 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -401,6 +401,19 @@ class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { let Inst{49-41} = src0; } +class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0); +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; @@ -1506,6 +1519,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1525,6 +1539,7 @@ class VOP3_Profile_True16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1540,6 +1555,7 @@ class VOP3_Profile_Fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : let HasFP8SrcByteSel = P.HasFP8SrcByteSel; let HasFP8DstByteSel = P.HasFP8DstByteSel; let HasOMod = P.HasOMod; + let HasBitOp3 = P.HasBitOp3; let HasModifiers = !if (Features.IsMAI, 0, @@ -1723,6 +1739,34 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> let Inst{14 - 8} = sdst; } +class VOP3_BITOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo p, GFXGen Gen, string asmName> + : VOP3_DPP16_Gen_t16<op, p, Gen, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, src0_modifiers{3}, 0); +} + +class VOP3_BITOP3_DPP8<bits<10> op, VOP_Pseudo p, string asmName> + : Base_VOP3_DPP8_t16<op, p, asmName> { + bits<8> bitop3; + + let Inst{60-59} = bitop3{7-6}; + let Inst{10-8} = bitop3{5-3}; + let Inst{63-61} = bitop3{2-0}; + + let Inst{11} = !if(p.Pfl.HasOpSel, src0_modifiers{2}, 0); + let Inst{12} = !if(p.Pfl.HasOpSel, src1_modifiers{2}, 0); + let Inst{13} = !if(p.Pfl.HasOpSel, src2_modifiers{2}, 0); + let Inst{14} = !if(p.Pfl.HasOpSel, 
src0_modifiers{3}, 0); +} + class VOP3b_DPP8_Base_t16<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : Base_VOP3_DPP8<op, ps, opName> { bits<8> sdst; @@ -1943,6 +1987,29 @@ multiclass VOP3be_Realtriple< multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> : VOP3be_Realtriple<Gen, op, 1>; +multiclass VOP3_BITOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName> { + def _e64_dpp#Gen.Suffix : + VOP3_BITOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(NAME#"_e64"#"_dpp"), Gen, asmName>; +} + +multiclass VOP3_BITOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP3_Pseudo>(NAME#"_e64"); + def _e64_dpp8#Gen.Suffix : VOP3_BITOP3_DPP8<op, ps, asmName> { + let DecoderNamespace = + Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16"); + let AssemblerPredicate = Gen.AssemblerPredicate; + } +} + +multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + let IsSingle = ps.Pfl.IsSingle, AsmString = asmName # ps.AsmOperands in { + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3a_BITOP3_gfx12<op, ps.Pfl>; + } +} + //===----------------------------------------------------------------------===// // VOP3 GFX11 //===----------------------------------------------------------------------===// @@ -2004,6 +2071,15 @@ multiclass VOP3Only_Real_Base_gfx1250<bits<10> op> : multiclass VOP3Only_Realtriple_gfx1250<bits<10> op, bit isSingle = 0> : VOP3_Realtriple<GFX1250Gen, op, isSingle>; +multiclass VOP3Only_Realtriple_with_name_gfx1250<bits<10> op, string opName, + string asmName, string pseudo_mnemonic = "", + bit isSingle = 0> : + VOP3_Realtriple_with_name<GFX1250Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + +multiclass VOP3Only_Realtriple_t16_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3Only_Realtriple_with_name_gfx1250<op, opName, asmName, pseudo_mnemonic, isSingle>; + multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; @@ -2024,6 +2100,13 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, defm _fake16 : VOP3Only_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic>; } +multiclass VOP3Only_Realtriple_t16_and_fake16_gfx1250<bits<10> op, + string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + string opName = NAME, string pseudo_mnemonic = ""> { + defm _t16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_t16", pseudo_mnemonic>; + defm _fake16 : VOP3Only_Realtriple_t16_gfx1250<op, asmName, opName#"_fake16", pseudo_mnemonic>; +} + multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName, string asmName, bit isSingle = 0> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); @@ -2046,6 +2129,16 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>, VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>; +multiclass VOP3_Real_BITOP3_gfx1250<bits<10> op, string asmName = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : + VOP3_BITOP3_Real_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp_Base<GFX1250Gen, op, asmName>, + VOP3_BITOP3_Real_dpp8_Base<GFX1250Gen, op, asmName>; + +multiclass VOP3_Real_BITOP3_t16_and_fake16_gfx1250<bits<10> op, string asmName 
= !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> { + defm _t16 : VOP3_Real_BITOP3_gfx1250<op, asmName>; + defm _fake16: VOP3_Real_BITOP3_gfx1250<op, asmName>; +} + multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0, string opName = NAME> : VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>, diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 50217c3..9e4dbec 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl( // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI.getParent()->getParent(); - // FIXME: Use Function::hasOptSize(). - if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize)) + if (MF->getFunction().hasOptSize()) --Latency; } return Latency; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8b7f06a..bd4b75f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); @@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Legal); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); + setOperationAction(ISD::FTRUNC, MVT::f16, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); + setOperationAction(ISD::FRINT, MVT::f16, Legal); + setOperationAction(ISD::FFLOOR, MVT::f16, Legal); + setOperationAction(ISD::FCEIL, MVT::f16, Legal); } if (Subtarget->hasNEON()) { @@ -2412,6 +2423,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool doesNotRet = CLI.DoesNotReturn; bool isVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2435,6 +2447,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, !Subtarget->noBTIAtReturnTwice()) GuardWithBTI = AFI->branchTargetEnforcement(); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + // Determine whether this is a non-secure function call. 
if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) isCmseNSCall = true; @@ -20347,6 +20363,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight( return weight; } +static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) { + if (PR == 0 || VT == MVT::Other) + return false; + return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) || + (ARM::DPRRegClass.contains(PR) && VT != MVT::f64); +} + using RCPair = std::pair<unsigned, const TargetRegisterClass *>; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( @@ -20420,7 +20443,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( if (StringRef("{cc}").equals_insensitive(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + if (isIncompatibleReg(RCP.first, VT)) + return {0, nullptr}; + return RCP; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ec6f4e2..ece6c10 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -12327,7 +12327,7 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) { } assert(Section && "must have section to emit alignment"); - if (Section->useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(*Section)) getStreamer().emitCodeAlignment(Align(2), &getSTI()); else getStreamer().emitValueToAlignment(Align(2)); @@ -12525,7 +12525,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) { // '.align' is target specifically handled to mean 2**2 byte alignment. 
const MCSection *Section = getStreamer().getCurrentSectionOnly(); assert(Section && "must have section to emit alignment"); - if (Section->useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(*Section)) getStreamer().emitCodeAlignment(Align(4), &getSTI(), 0); else getStreamer().emitValueToAlignment(Align(4), 0, 1, 0); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index a7a9911..6dfe846 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -708,8 +708,6 @@ private: void SwitchToExTabSection(const MCSymbol &FnStart); void SwitchToExIdxSection(const MCSymbol &FnStart); - void EmitFixup(const MCExpr *Expr, MCFixupKind Kind); - bool IsThumb; bool IsAndroid; @@ -1096,8 +1094,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { } void ARMTargetELFStreamer::annotateTLSDescriptorSequence( - const MCSymbolRefExpr *S) { - getStreamer().EmitFixup(S, FK_Data_4); + const MCSymbolRefExpr *Expr) { + getStreamer().addFixup(Expr, FK_Data_4); } void ARMTargetELFStreamer::emitCode16() { getStreamer().setIsThumb(true); } @@ -1140,7 +1138,8 @@ void ARMTargetELFStreamer::finish() { MCContext &Ctx = getContext(); auto &Asm = getStreamer().getAssembler(); if (any_of(Asm, [](const MCSection &Sec) { - return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE; + return static_cast<const MCSectionELF &>(Sec).getFlags() & + ELF::SHF_ARM_PURECODE; })) { auto *Text = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); @@ -1206,11 +1205,6 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { SectionKind::getData(), FnStart); } -void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { - MCFragment *Frag = getCurrentFragment(); - Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind)); -} - void ARMELFStreamer::EHReset() { ExTab = nullptr; FnStart = nullptr; @@ -1290,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } // Add the R_ARM_NONE fixup at the same position void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name); + visitUsedSymbol(*PersonalitySym); const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); - - visitUsedExpr(*PersonalityRef); - MCFragment *DF = getCurrentFragment(); - DF->addFixup( - MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4)); + addFixup(PersonalityRef, FK_Data_4); } void ARMELFStreamer::FlushPendingOffset() { diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index ad8aa571..0fb33cd 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -260,7 +260,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) { continue; } - auto *Section = cast<MCSectionELF>(TLOF.SectionForGlobal(&GO, TM)); + auto *Section = static_cast<MCSectionELF *>(TLOF.SectionForGlobal(&GO, TM)); if (Section->getName().starts_with(".data")) NeedsCopyData = true; else if (Section->getName().starts_with(".rodata") && SubTM->hasLPM()) diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp index c2c1bb4..0615ec9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp @@ -24,8 +24,6 @@ AVRMCAsmInfo::AVRMCAsmInfo(const 
Triple &TT, const MCTargetOptions &Options) { CalleeSaveStackSlotSize = 2; CommentString = ";"; SeparatorString = "$"; - PrivateGlobalPrefix = ".L"; - PrivateLabelPrefix = ".L"; UsesELFSectionDirectiveForBSS = true; SupportsDebugInformation = true; } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h index fab2713..1915fa8 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h @@ -14,7 +14,7 @@ #define LLVM_AVR_ASM_INFO_H #include "MCTargetDesc/AVRMCExpr.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" #include "llvm/MC/MCExpr.h" namespace llvm { @@ -22,7 +22,7 @@ namespace llvm { class Triple; /// Specifies the format of AVR assembly files. -class AVRMCAsmInfo : public MCAsmInfo { +class AVRMCAsmInfo : public MCAsmInfoELF { public: explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options); void printSpecifierExpr(raw_ostream &OS, diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 1e29a0f..bed6bc9 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { return; } - // MapDef type may be a struct type or a non-pointer derived type - const DIType *OrigTy = Ty; - while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { - auto Tag = DTy->getTag(); - if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && - Tag != dwarf::DW_TAG_volatile_type && - Tag != dwarf::DW_TAG_restrict_type) - break; - Ty = DTy->getBaseType(); - } - - const auto *CTy = dyn_cast<DICompositeType>(Ty); - if (!CTy) - return; - - auto Tag = CTy->getTag(); - if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl()) - return; - - // Visit all struct members to ensure their types are visited. - const DINodeArray Elements = CTy->getElements(); - for (const auto *Element : Elements) { - const auto *MemberType = cast<DIDerivedType>(Element); - const DIType *MemberBaseType = MemberType->getBaseType(); - - // If the member is a composite type, that may indicate the currently - // visited composite type is a wrapper, and the member represents the - // actual map definition. - // In that case, visit the member with `visitMapDefType` instead of - // `visitTypeEntry`, treating it specifically as a map definition rather - // than as a regular composite type. - const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); - if (MemberCTy) { - visitMapDefType(MemberBaseType, TypeId); - } else { - visitTypeEntry(MemberBaseType); + uint32_t TmpId; + switch (Ty->getTag()) { + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_volatile_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_pointer_type: + visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_array_type: + // Visit nested map array and jump to the element type + visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_structure_type: { + // Visit all struct members to ensure their types are visited. 
+ const auto *CTy = cast<DICompositeType>(Ty); + const DINodeArray Elements = CTy->getElements(); + for (const auto *Element : Elements) { + const auto *MemberType = cast<DIDerivedType>(Element); + const DIType *MemberBaseType = MemberType->getBaseType(); + // If the member is a composite type, that may indicate the currently + // visited composite type is a wrapper, and the member represents the + // actual map definition. + // In that case, visit the member with `visitMapDefType` instead of + // `visitTypeEntry`, treating it specifically as a map definition rather + // than as a regular composite type. + const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); + if (MemberCTy) { + visitMapDefType(MemberBaseType, TmpId); + } else { + visitTypeEntry(MemberBaseType); + } } + break; + } + default: + break; } // Visit this type, struct or a const/typedef/volatile/restrict type - visitTypeEntry(OrigTy, TypeId, false, false); + visitTypeEntry(Ty, TypeId, false, false); } /// Read file contents from the actual file or from the source @@ -1255,10 +1255,8 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) { FuncInfo.Label = FuncLabel; FuncInfo.TypeId = FuncTypeId; if (FuncLabel->isInSection()) { - MCSection &Section = FuncLabel->getSection(); - const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section); - assert(SectionELF && "Null section for Function Label"); - SecNameOff = addString(SectionELF->getName()); + auto &Sec = static_cast<const MCSectionELF &>(FuncLabel->getSection()); + SecNameOff = addString(Sec.getName()); } else { SecNameOff = addString(".text"); } diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 827e928..bb74f6a 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -54,11 +54,8 @@ unsigned BPFELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCSymbol &Sym = *A; if (Sym.isDefined()) { - MCSection &Section = Sym.getSection(); - const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section); - assert(SectionELF && "Null section for reloc symbol"); - - unsigned Flags = SectionELF->getFlags(); + auto &Section = static_cast<const MCSectionELF &>(Sym.getSection()); + unsigned Flags = Section.getFlags(); if (Sym.isTemporary()) { // .BTF.ext generates FK_Data_4 relocations for diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index 7b21684..63d6e6f 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -13,18 +13,19 @@ #ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H #define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" #include "llvm/TargetParser/Triple.h" namespace llvm { -class BPFMCAsmInfo : public MCAsmInfo { +class BPFMCAsmInfo : public MCAsmInfoELF { public: explicit BPFMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) { if (TT.getArch() == Triple::bpfeb) IsLittleEndian = false; PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = "L"; WeakRefDirective = "\t.weak\t"; UsesELFSectionDirectiveForBSS = true; diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index f0e2e78..7e1436e 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -263,8 +263,13 @@ bool 
DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { // merge the byte offsets. Otherwise, this GEP is itself the root of a GEP // chain and we need to deterine the root array type if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) { - assert(GEPChainInfoMap.contains(PtrOpGEP) && - "Expected parent GEP to be visited before this GEP"); + + // If the parent GEP was not processed, then we do not want to process its + // descendants. This can happen if the GEP chain is for an unsupported type + // such as a struct -- we do not flatten structs nor GEP chains for structs + if (!GEPChainInfoMap.contains(PtrOpGEP)) + return false; + GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP]; Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType; Info.RootPointerOperand = PGEPInfo.RootPointerOperand; diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp index c73648f..3427968 100644 --- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp +++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp @@ -24,18 +24,19 @@ using namespace llvm; -static void legalizeFreeze(Instruction &I, +static bool legalizeFreeze(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *>) { auto *FI = dyn_cast<FreezeInst>(&I); if (!FI) - return; + return false; FI->replaceAllUsesWith(FI->getOperand(0)); ToRemove.push_back(FI); + return true; } -static void fixI8UseChain(Instruction &I, +static bool fixI8UseChain(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { @@ -74,19 +75,19 @@ static void fixI8UseChain(Instruction &I, if (Trunc->getDestTy()->isIntegerTy(8)) { ReplacedValues[Trunc] = Trunc->getOperand(0); ToRemove.push_back(Trunc); - return; + return true; } } if (auto *Store = dyn_cast<StoreInst>(&I)) { if (!Store->getValueOperand()->getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewStore = Builder.CreateStore(NewOperands[0], NewOperands[1]); ReplacedValues[Store] = NewStore; ToRemove.push_back(Store); - return; + return true; } if (auto *Load = dyn_cast<LoadInst>(&I); @@ -104,17 +105,17 @@ static void fixI8UseChain(Instruction &I, LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]); ReplacedValues[Load] = NewLoad; ToRemove.push_back(Load); - return; + return true; } if (auto *Load = dyn_cast<LoadInst>(&I); Load && isa<ConstantExpr>(Load->getPointerOperand())) { auto *CE = dyn_cast<ConstantExpr>(Load->getPointerOperand()); if (!(CE->getOpcode() == Instruction::GetElementPtr)) - return; + return false; auto *GEP = dyn_cast<GEPOperator>(CE); if (!GEP->getSourceElementType()->isIntegerTy(8)) - return; + return false; Type *ElementType = Load->getType(); ConstantInt *Offset = dyn_cast<ConstantInt>(GEP->getOperand(1)); @@ -143,12 +144,12 @@ static void fixI8UseChain(Instruction &I, ReplacedValues[Load] = NewLoad; Load->replaceAllUsesWith(NewLoad); ToRemove.push_back(Load); - return; + return true; } if (auto *BO = dyn_cast<BinaryOperator>(&I)) { if (!I.getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewInst = @@ -162,24 +163,24 @@ static void fixI8UseChain(Instruction &I, } ReplacedValues[BO] = NewInst; ToRemove.push_back(BO); - return; + return true; } if (auto *Sel = dyn_cast<SelectInst>(&I)) { if (!I.getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); 
Value *NewInst = Builder.CreateSelect(Sel->getCondition(), NewOperands[1], NewOperands[2]); ReplacedValues[Sel] = NewInst; ToRemove.push_back(Sel); - return; + return true; } if (auto *Cmp = dyn_cast<CmpInst>(&I)) { if (!Cmp->getOperand(0)->getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewInst = @@ -187,18 +188,18 @@ static void fixI8UseChain(Instruction &I, Cmp->replaceAllUsesWith(NewInst); ReplacedValues[Cmp] = NewInst; ToRemove.push_back(Cmp); - return; + return true; } if (auto *Cast = dyn_cast<CastInst>(&I)) { if (!Cast->getSrcTy()->isIntegerTy(8)) - return; + return false; ToRemove.push_back(Cast); auto *Replacement = ReplacedValues[Cast->getOperand(0)]; if (Cast->getType() == Replacement->getType()) { Cast->replaceAllUsesWith(Replacement); - return; + return true; } Value *AdjustedCast = nullptr; @@ -213,7 +214,7 @@ static void fixI8UseChain(Instruction &I, if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { if (!GEP->getType()->isPointerTy() || !GEP->getSourceElementType()->isIntegerTy(8)) - return; + return false; Value *BasePtr = GEP->getPointerOperand(); if (ReplacedValues.count(BasePtr)) @@ -248,15 +249,17 @@ static void fixI8UseChain(Instruction &I, ReplacedValues[GEP] = NewGEP; GEP->replaceAllUsesWith(NewGEP); ToRemove.push_back(GEP); + return true; } + return false; } -static void upcastI8AllocasAndUses(Instruction &I, +static bool upcastI8AllocasAndUses(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { auto *AI = dyn_cast<AllocaInst>(&I); if (!AI || !AI->getAllocatedType()->isIntegerTy(8)) - return; + return false; Type *SmallestType = nullptr; @@ -291,16 +294,17 @@ static void upcastI8AllocasAndUses(Instruction &I, } if (!SmallestType) - return; // no valid casts found + return false; // no valid casts found // Replace alloca IRBuilder<> Builder(AI); auto *NewAlloca = Builder.CreateAlloca(SmallestType); ReplacedValues[AI] = NewAlloca; ToRemove.push_back(AI); + return true; } -static void +static bool downcastI64toI32InsertExtractElements(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { @@ -318,6 +322,7 @@ downcastI64toI32InsertExtractElements(Instruction &I, Extract->replaceAllUsesWith(NewExtract); ToRemove.push_back(Extract); + return true; } } @@ -335,8 +340,10 @@ downcastI64toI32InsertExtractElements(Instruction &I, Insert->replaceAllUsesWith(Insert32Index); ToRemove.push_back(Insert); + return true; } } + return false; } static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, @@ -453,17 +460,17 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val, // Expands the instruction `I` into corresponding loads and stores if it is a // memcpy call. In that case, the call instruction is added to the `ToRemove` // vector. `ReplacedValues` is unused. 
-static void legalizeMemCpy(Instruction &I, +static bool legalizeMemCpy(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { CallInst *CI = dyn_cast<CallInst>(&I); if (!CI) - return; + return false; Intrinsic::ID ID = CI->getIntrinsicID(); if (ID != Intrinsic::memcpy) - return; + return false; IRBuilder<> Builder(&I); Value *Dst = CI->getArgOperand(0); @@ -476,19 +483,20 @@ static void legalizeMemCpy(Instruction &I, assert(IsVolatile->getZExtValue() == 0 && "Expected IsVolatile to be false"); emitMemcpyExpansion(Builder, Dst, Src, Length); ToRemove.push_back(CI); + return true; } -static void legalizeMemSet(Instruction &I, +static bool legalizeMemSet(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { CallInst *CI = dyn_cast<CallInst>(&I); if (!CI) - return; + return false; Intrinsic::ID ID = CI->getIntrinsicID(); if (ID != Intrinsic::memset) - return; + return false; IRBuilder<> Builder(&I); Value *Dst = CI->getArgOperand(0); @@ -497,23 +505,25 @@ static void legalizeMemSet(Instruction &I, assert(Size && "Expected Size to be a ConstantInt"); emitMemsetExpansion(Builder, Dst, Val, Size, ReplacedValues); ToRemove.push_back(CI); + return true; } -static void updateFnegToFsub(Instruction &I, +static bool updateFnegToFsub(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { const Intrinsic::ID ID = I.getOpcode(); if (ID != Instruction::FNeg) - return; + return false; IRBuilder<> Builder(&I); Value *In = I.getOperand(0); Value *Zero = ConstantFP::get(In->getType(), -0.0); I.replaceAllUsesWith(Builder.CreateFSub(Zero, In)); ToRemove.push_back(&I); + return true; } -static void +static bool legalizeGetHighLowi64Bytes(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { @@ -523,13 +533,13 @@ legalizeGetHighLowi64Bytes(Instruction &I, BitCast->getSrcTy()->isIntegerTy(64)) { ToRemove.push_back(BitCast); ReplacedValues[BitCast] = BitCast->getOperand(0); - return; + return true; } } if (auto *Extract = dyn_cast<ExtractElementInst>(&I)) { if (!dyn_cast<BitCastInst>(Extract->getVectorOperand())) - return; + return false; auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType()); if (VecTy && VecTy->getElementType()->isIntegerTy(32) && VecTy->getNumElements() == 2) { @@ -557,12 +567,14 @@ legalizeGetHighLowi64Bytes(Instruction &I, } ToRemove.push_back(Extract); Extract->replaceAllUsesWith(ReplacedValues[Extract]); + return true; } } } + return false; } -static void +static bool legalizeScalarLoadStoreOnArrays(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { @@ -579,14 +591,14 @@ legalizeScalarLoadStoreOnArrays(Instruction &I, PtrOpIndex = SI->getPointerOperandIndex(); LoadStoreTy = SI->getValueOperand()->getType(); } else - return; + return false; // If the load/store is not of a single-value type (i.e., scalar or vector) // then we do not modify it. It shouldn't be a vector either because the // dxil-data-scalarization pass is expected to run before this, but it's not // incorrect to apply this transformation to vector load/stores. 
if (!LoadStoreTy->isSingleValueType()) - return; + return false; Type *ArrayTy; if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp)) @@ -594,10 +606,10 @@ legalizeScalarLoadStoreOnArrays(Instruction &I, else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp)) ArrayTy = AllocaPtrOp->getAllocatedType(); else - return; + return false; if (!isa<ArrayType>(ArrayTy)) - return; + return false; assert(ArrayTy->getArrayElementType() == LoadStoreTy && "Expected array element type to be the same as to the scalar load or " @@ -607,6 +619,7 @@ legalizeScalarLoadStoreOnArrays(Instruction &I, Value *GEP = GetElementPtrInst::Create( ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator()); I.setOperand(PtrOpIndex, GEP); + return true; } namespace { @@ -624,13 +637,11 @@ public: ReplacedValues.clear(); for (auto &I : instructions(F)) { for (auto &LegalizationFn : LegalizationPipeline[Stage]) - LegalizationFn(I, ToRemove, ReplacedValues); + MadeChange |= LegalizationFn(I, ToRemove, ReplacedValues); } for (auto *Inst : reverse(ToRemove)) Inst->eraseFromParent(); - - MadeChange |= !ToRemove.empty(); } return MadeChange; } @@ -639,7 +650,7 @@ private: enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages }; using LegalizationFnTy = - std::function<void(Instruction &, SmallVectorImpl<Instruction *> &, + std::function<bool(Instruction &, SmallVectorImpl<Instruction *> &, DenseMap<Value *, Value *> &)>; SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages]; diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 566f3a9..c33ec0e 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -241,7 +241,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) { } static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) { - bool Changed = false; SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources; for (BasicBlock &BB : F) for (Instruction &I : BB) @@ -254,7 +253,7 @@ static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) { for (auto &[II, RI] : Resources) replaceAccess(II, RI); - return Changed; + return !Resources.empty(); } PreservedAnalyses DXILResourceAccess::run(Function &F, diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index eb4adfe..e7e7f2c 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -106,11 +106,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, DXILResourceTypeMap &DRTM, const ModuleMetadataInfo &MMDI) { if (!CSF.Doubles) - CSF.Doubles = I.getType()->isDoubleTy(); + CSF.Doubles = I.getType()->getScalarType()->isDoubleTy(); if (!CSF.Doubles) { for (const Value *Op : I.operands()) { - if (Op->getType()->isDoubleTy()) { + if (Op->getType()->getScalarType()->isDoubleTy()) { CSF.Doubles = true; break; } @@ -130,12 +130,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } if (!CSF.LowPrecisionPresent) - CSF.LowPrecisionPresent = - I.getType()->isIntegerTy(16) || I.getType()->isHalfTy(); + CSF.LowPrecisionPresent = I.getType()->getScalarType()->isIntegerTy(16) || + I.getType()->getScalarType()->isHalfTy(); if (!CSF.LowPrecisionPresent) { for (const Value *Op : I.operands()) { - if (Op->getType()->isIntegerTy(16) || Op->getType()->isHalfTy()) { + if (Op->getType()->getScalarType()->isIntegerTy(16) || + 
Op->getType()->getScalarType()->isHalfTy()) { CSF.LowPrecisionPresent = true; break; } @@ -150,11 +151,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } if (!CSF.Int64Ops) - CSF.Int64Ops = I.getType()->isIntegerTy(64); + CSF.Int64Ops = I.getType()->getScalarType()->isIntegerTy(64); if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) { for (const Value *Op : I.operands()) { - if (Op->getType()->isIntegerTy(64)) { + if (Op->getType()->getScalarType()->isIntegerTy(64)) { CSF.Int64Ops = true; break; } diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index c86fa2b..54c3cea 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); const Function &F = MF.getFunction(); - bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = F.hasOptSize(); // Combine aggressively (for code size) ShouldCombineAggressively = diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp index 6eccf80..9d7776d 100644 --- a/llvm/lib/Target/Hexagon/HexagonMask.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp @@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) { HII = HST.getInstrInfo(); const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.hasOptSize()) return false; // Mask instruction is available only from v66 if (!HST.hasV66Ops()) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e915a3c4..a5bf0e5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2385,23 +2385,6 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op, return Res; } -static bool isConstantOrUndef(const SDValue Op) { - if (Op->isUndef()) - return true; - if (isa<ConstantSDNode>(Op)) - return true; - if (isa<ConstantFPSDNode>(Op)) - return true; - return false; -} - -static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) { - for (unsigned i = 0; i < Op->getNumOperands(); ++i) - if (isConstantOrUndef(Op->getOperand(i))) - return true; - return false; -} - // Lower BUILD_VECTOR as broadcast load (if possible). // For example: // %a = load i8, ptr %ptr @@ -2451,10 +2434,14 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op); EVT ResTy = Op->getValueType(0); + unsigned NumElts = ResTy.getVectorNumElements(); SDLoc DL(Op); APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; + bool IsConstant = false; + bool UseSameConstant = true; + SDValue ConstantValue; bool Is128Vec = ResTy.is128BitVector(); bool Is256Vec = ResTy.is256BitVector(); @@ -2505,20 +2492,45 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, if (DAG.isSplatValue(Op, /*AllowUndefs=*/false)) return Op; - if (!isConstantOrUndefBUILD_VECTOR(Node)) { + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Opi = Node->getOperand(i); + if (isIntOrFPConstant(Opi)) { + IsConstant = true; + if (!ConstantValue.getNode()) + ConstantValue = Opi; + else if (ConstantValue != Opi) + UseSameConstant = false; + } + } + + // If the type of BUILD_VECTOR is v2f64, custom legalizing it has no benefits. 
+ if (IsConstant && UseSameConstant && ResTy != MVT::v2f64) { + SDValue Result = DAG.getSplatBuildVector(ResTy, DL, ConstantValue); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Opi = Node->getOperand(i); + if (!isIntOrFPConstant(Opi)) + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Result, Opi, + DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + } + return Result; + } + + if (!IsConstant) { // Use INSERT_VECTOR_ELT operations rather than expand to stores. // The resulting code is the same length as the expansion, but it doesn't // use memory operations. - EVT ResTy = Node->getValueType(0); - assert(ResTy.isVector()); - unsigned NumElts = ResTy.getVectorNumElements(); - SDValue Vector = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0)); + SDValue Op0 = Node->getOperand(0); + SDValue Vector = DAG.getUNDEF(ResTy); + + if (!Op0.isUndef()) + Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0); for (unsigned i = 1; i < NumElts; ++i) { - Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, - Node->getOperand(i), + SDValue Opi = Node->getOperand(i); + if (Opi.isUndef()) + continue; + Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi, DAG.getConstant(i, DL, Subtarget.getGRLenVT())); } return Vector; @@ -2609,9 +2621,38 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - if (isa<ConstantSDNode>(Op->getOperand(2))) + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = EltVT.getScalarSizeInBits(); + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + + if (isa<ConstantSDNode>(Op2)) return Op; - return SDValue(); + + MVT IdxTy = MVT::getIntegerVT(EltSizeInBits); + MVT IdxVTy = MVT::getVectorVT(IdxTy, NumElts); + + if (!isTypeLegal(VT) || !isTypeLegal(IdxVTy)) + return SDValue(); + + SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1); + SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2); + + SmallVector<SDValue, 32> RawIndices; + for (unsigned i = 0; i < NumElts; ++i) + RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices); + + // insert vec, elt, idx + // => + // select (splatidx == {0,1,2...}) ? 
splatelt : vec + SDValue SelectCC = + DAG.getSetCC(DL, IdxVTy, SplatIdx, Indices, ISD::CondCode::SETEQ); + return DAG.getNode(ISD::VSELECT, DL, VT, SelectCC, SplatElt, Op0); } SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 8fa72bc..d9ea88c 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -254,6 +254,7 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup Fixup = MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); + F.setLinkerRelaxable(); F.getParent()->setLinkerRelaxable(); return true; } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index feb4eb3..d9680c7 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -969,7 +969,7 @@ void MipsTargetELFStreamer::finish() { Align Alignment = Section.getAlign(); S.switchSection(&Section); - if (Section.useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(Section)) S.emitCodeAlignment(Alignment, &STI, Alignment.value()); else S.emitValueToAlignment(Alignment, 0, 1, Alignment.value()); diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index ca03310..a2e48ab 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -737,14 +737,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) { if (FS.empty() && M.size() && F->hasFnAttribute("target-features")) FS = F->getFnAttribute("target-features").getValueAsString(); + std::string strFS = FS.str(); + if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool()) + strFS += strFS.empty() ? "+soft-float" : ",+soft-float"; + // Compute MIPS architecture attributes based on the default subtarget // that we'd have constructed. // FIXME: For ifunc related functions we could iterate over and look // for a feature string that doesn't match the default one. 
StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM); - const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, - std::nullopt); + const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(), + MTM, std::nullopt); bool IsABICalls = STI.isABICalls(); const MipsABIInfo &ABI = MTM.getABI(); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 0e581a7..881ba8e 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Custom); - setOperationAction(ISD::ConstantFP, MVT::f64, Custom); - setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND, ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL, ISD::SIGN_EXTEND}); @@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG); case ISD::READCYCLECOUNTER: return lowerREADCYCLECOUNTER(Op, DAG); - case ISD::ConstantFP: - return lowerConstantFP(Op, DAG); } return SDValue(); } @@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc); } -SDValue MipsTargetLowering::lowerConstantFP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getSimpleValueType(); - SDNode *N = Op.getNode(); - ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N); - - if (!CFP->isNaN() || Subtarget.isNaN2008()) { - return SDValue(); - } - - APFloat NaNValue = CFP->getValueAPF(); - auto &Sem = NaNValue.getSemantics(); - - // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN - // encodings, and one for sNaNs. Check every NaN constants and make sure - // they are correctly encoded for legacy encodings. - if (!NaNValue.isSignaling()) { - APFloat RealQNaN = NaNValue.getSNaN(Sem); - return DAG.getConstantFP(RealQNaN, DL, VT); - } - return SDValue(); -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -3370,6 +3341,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -3426,8 +3398,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned StackSize = CCInfo.getStackSize(); - // Call site info for function parameters tracking. + // Call site info for function parameters tracking and call base type info. MachineFunction::CallSiteInfo CSInfo; + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); // Check if it's really possible to do a tail call. Restrict it to functions // that are part of this compilation unit. 
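As a standalone illustration of the feature-string handling added to MipsAsmPrinter::emitStartOfAsmFile above (a minimal sketch, not part of the patch; the helper name and the example feature strings are made up), the new code only appends "+soft-float" to the comma-separated subtarget feature string when the first function carries "use-soft-float":

#include <iostream>
#include <string>

// Mirrors the logic in the hunk above: append "+soft-float" to the subtarget
// feature string, using a comma separator only when other features are
// already present.
static std::string appendSoftFloat(std::string FS, bool UseSoftFloat) {
  if (UseSoftFloat)
    FS += FS.empty() ? "+soft-float" : ",+soft-float";
  return FS;
}

int main() {
  std::cout << appendSoftFloat("", true) << '\n';           // +soft-float
  std::cout << appendSoftFloat("+mips32r2", true) << '\n';  // +mips32r2,+soft-float
  std::cout << appendSoftFloat("+mips32r2", false) << '\n'; // +mips32r2
  return 0;
}

Presumably this is done so that the temporary MipsSubtarget constructed for the module-level directives reflects the soft-float setting of the first function.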
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 31ac5d4..c65c76c 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -592,7 +592,6 @@ class TargetRegisterClass; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const; /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 614b321..ce9cd12 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -15,8 +15,6 @@ using namespace llvm; -void NVPTXMCAsmInfo::anchor() {} - NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options) { if (TheTriple.getArch() == Triple::nvptx64) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index 77c4dae..f071406 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -19,8 +19,6 @@ namespace llvm { class Triple; class NVPTXMCAsmInfo : public MCAsmInfo { - virtual void anchor(); - public: explicit NVPTXMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp index 9f91143..329e3b5 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp @@ -97,10 +97,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection, if (isDwarfSection(FI, Section)) { // Emit DWARF .file directives in the outermost scope. outputDwarfFileDirectives(); - OS << "\t.section"; - Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(), - getStreamer().getContext().getTargetTriple(), - OS, SubSection); + OS << "\t.section\t" << Section->getName() << '\n'; // DWARF sections are enclosed into braces - emit the open one. 
OS << "\t{\n"; HasSections = true; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 65e7c56..95abcde 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -56,9 +56,7 @@ INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false) NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm) { - doMulWide = (OptLevel > CodeGenOptLevel::None); -} + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<NVPTXSubtarget>(); @@ -145,18 +143,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStoreVector(N)) return; break; - case NVPTXISD::LoadParam: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadParamV4: - if (tryLoadParam(N)) - return; - break; - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - if (tryStoreParam(N)) - return; - break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; @@ -1462,267 +1448,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return true; } -bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Offset = Node->getOperand(2); - SDValue Glue = Node->getOperand(3); - SDLoc DL(Node); - MemSDNode *Mem = cast<MemSDNode>(Node); - - unsigned VecSize; - switch (Node->getOpcode()) { - default: - return false; - case NVPTXISD::LoadParam: - VecSize = 1; - break; - case NVPTXISD::LoadParamV2: - VecSize = 2; - break; - case NVPTXISD::LoadParamV4: - VecSize = 4; - break; - } - - EVT EltVT = Node->getValueType(0); - EVT MemVT = Mem->getMemoryVT(); - - std::optional<unsigned> Opcode; - - switch (VecSize) { - default: - return false; - case 1: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, - NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64); - break; - case 2: - Opcode = - pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, - NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, - NVPTX::LoadParamMemV2I64); - break; - case 4: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, - NVPTX::LoadParamMemV4I32, {/* no v4i64 */}); - break; - } - if (!Opcode) - return false; - - SDVTList VTs; - if (VecSize == 1) { - VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); - } else if (VecSize == 2) { - VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); - } else { - EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(EVTs); - } - - unsigned OffsetVal = Offset->getAsZExtVal(); - - SmallVector<SDValue, 2> Ops( - {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); - return true; -} - -// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri) -#define getOpcV2H(ty, opKind0, opKind1) \ - NVPTX::StoreParamV2##ty##_##opKind0##opKind1 - -#define getOpcV2H1(ty, opKind0, isImm1) \ - (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r) - -#define getOpcodeForVectorStParamV2(ty, isimm) \ - (isimm[0]) ? 
getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1]) - -#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \ - NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3 - -#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \ - (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \ - : getOpcV4H(ty, opKind0, opKind1, opKind2, r) - -#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \ - (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \ - : getOpcV4H3(ty, opKind0, opKind1, r, isImm3) - -#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \ - (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \ - : getOpcV4H2(ty, opKind0, r, isImm2, isImm3) - -#define getOpcodeForVectorStParamV4(ty, isimm) \ - (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \ - : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3]) - -#define getOpcodeForVectorStParam(n, ty, isimm) \ - (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \ - : getOpcodeForVectorStParamV4(ty, isimm) - -static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, - unsigned NumElts, - MVT::SimpleValueType MemTy, - SelectionDAG *CurDAG, SDLoc DL) { - // Determine which inputs are registers and immediates make new operators - // with constant values - SmallVector<bool, 4> IsImm(NumElts, false); - for (unsigned i = 0; i < NumElts; i++) { - IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i])); - if (IsImm[i]) { - SDValue Imm = Ops[i]; - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[i] = Imm; - } - } - - // Get opcode for MemTy, size, and register/immediate operand ordering - switch (MemTy) { - case MVT::i8: - return getOpcodeForVectorStParam(NumElts, I8, IsImm); - case MVT::i16: - return getOpcodeForVectorStParam(NumElts, I16, IsImm); - case MVT::i32: - return getOpcodeForVectorStParam(NumElts, I32, IsImm); - case MVT::i64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(I64, IsImm); - case MVT::f32: - return getOpcodeForVectorStParam(NumElts, F32, IsImm); - case MVT::f64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(F64, IsImm); - - // These cases don't support immediates, just use the all register version - // and generate moves. - case MVT::i1: - return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr - : NVPTX::StoreParamV4I8_rrrr; - case MVT::f16: - case MVT::bf16: - return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr - : NVPTX::StoreParamV4I16_rrrr; - case MVT::v2f16: - case MVT::v2bf16: - case MVT::v2i16: - case MVT::v4i8: - return (NumElts == 2) ? 
NVPTX::StoreParamV2I32_rr - : NVPTX::StoreParamV4I32_rrrr; - default: - llvm_unreachable("Cannot select st.param for unknown MemTy"); - } -} - -bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - SDValue Param = N->getOperand(1); - unsigned ParamVal = Param->getAsZExtVal(); - SDValue Offset = N->getOperand(2); - unsigned OffsetVal = Offset->getAsZExtVal(); - MemSDNode *Mem = cast<MemSDNode>(N); - SDValue Glue = N->getOperand(N->getNumOperands() - 1); - - // How many elements do we have? - unsigned NumElts; - switch (N->getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case NVPTXISD::StoreParam: - NumElts = 1; - break; - case NVPTXISD::StoreParamV2: - NumElts = 2; - break; - case NVPTXISD::StoreParamV4: - NumElts = 4; - break; - } - - // Build vector of operands - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(N->getOperand(i + 3)); - Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), - CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - // Determine target opcode - // If we have an i1, use an 8-bit store. The lowering code in - // NVPTXISelLowering will have already emitted an upcast. - std::optional<unsigned> Opcode; - switch (NumElts) { - default: - llvm_unreachable("Unexpected NumElts"); - case 1: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - SDValue Imm = Ops[0]; - if (MemTy != MVT::f16 && MemTy != MVT::bf16 && - (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) { - // Convert immediate to target constant - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[0] = Imm; - // Use immediate version of store param - Opcode = - pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i, - NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i); - } else - Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, - NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r, - NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r); - if (Opcode == NVPTX::StoreParamI8_r) { - // Fine tune the opcode depending on the size of the operand. - // This helps to avoid creating redundant COPY instructions in - // InstrEmitter::AddRegisterOperand(). 
- switch (Ops[0].getSimpleValueType().SimpleTy) { - default: - break; - case MVT::i32: - Opcode = NVPTX::StoreParamI8TruncI32_r; - break; - case MVT::i64: - Opcode = NVPTX::StoreParamI8TruncI64_r; - break; - } - } - break; - } - case 2: - case 4: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL); - break; - } - } - - SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); - MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); - - ReplaceNode(N, Ret); - return true; -} - /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b99b4ef..9e0f88e5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -40,9 +40,6 @@ private: class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { const NVPTXTargetMachine &TM; - // If true, generate mul.wide from sext and mul - bool doMulWide; - NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const; bool usePrecSqrtF32(const SDNode *N) const; bool useF32FTZ() const; @@ -78,8 +75,6 @@ private: bool tryLDG(MemSDNode *N); bool tryStore(SDNode *N); bool tryStoreVector(SDNode *N); - bool tryLoadParam(SDNode *N); - bool tryStoreParam(SDNode *N); bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryBFE(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f2c2f46..4fd3623 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -843,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT, ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, - ISD::STORE}); + ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // promoted to f32. v2f16 is expanded to f16, which is then promoted // to f32. 
for (const auto &Op : - {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { + {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); - setOperationAction(Op, MVT::f64, Legal); + // only div/rem/sqrt are legal for f64 + if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) { + setOperationAction(Op, MVT::f64, Legal); + } setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); @@ -1072,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) MAKE_CASE(NVPTXISD::CALL) - MAKE_CASE(NVPTXISD::LoadParam) - MAKE_CASE(NVPTXISD::LoadParamV2) - MAKE_CASE(NVPTXISD::LoadParamV4) - MAKE_CASE(NVPTXISD::StoreParam) - MAKE_CASE(NVPTXISD::StoreParamV2) - MAKE_CASE(NVPTXISD::StoreParamV4) MAKE_CASE(NVPTXISD::MoveParam) MAKE_CASE(NVPTXISD::UNPACK_VECTOR) MAKE_CASE(NVPTXISD::BUILD_VECTOR) @@ -1315,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } -static bool adjustElementType(EVT &ElementType) { - switch (ElementType.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::f16: - case MVT::bf16: - ElementType = MVT::i16; - return true; - case MVT::f32: - case MVT::v2f16: - case MVT::v2bf16: - ElementType = MVT::i32; - return true; - case MVT::f64: - ElementType = MVT::i64; - return true; - } -} - -// Use byte-store when the param address of the argument value is unaligned. -// This may happen when the return value is a field of a packed structure. -// -// This is called in LowerCall() when passing the param values. -static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, - uint64_t Offset, EVT ElementType, - SDValue StVal, SDValue &InGlue, - unsigned ArgID, const SDLoc &dl) { - // Bit logic only works on integer types - if (adjustElementType(ElementType)) - StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); - - // Store each byte - SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - // Shift the byte to the last byte position - SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, - DAG.getConstant(i * 8, dl, MVT::i32)); - SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - ShiftVal, InGlue}; - // Trunc store only the last byte by using - // st.param.b8 - // The register type can be larger than b8. - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, - MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); - } - return Chain; -} - -// Use byte-load when the param adress of the returned value is unaligned. -// This may happen when the returned value is a field of a packed structure. -static SDValue -LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, - EVT ElementType, SDValue &InGlue, - SmallVectorImpl<SDValue> &TempProxyRegOps, - const SDLoc &dl) { - // Bit logic only works on integer types - EVT MergedType = ElementType; - adjustElementType(MergedType); - - // Load each byte and construct the whole value. 
Initial value to 0 - SDValue RetVal = DAG.getConstant(0, dl, MergedType); - // LoadParamMemI8 loads into i16 register only - SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - InGlue}; - // This will be selected to LoadParamMemI8 - SDValue LdVal = - DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, - MVT::i8, MachinePointerInfo(), Align(1)); - SDValue TmpLdVal = LdVal.getValue(0); - Chain = LdVal.getValue(1); - InGlue = LdVal.getValue(2); - - TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, - TmpLdVal.getSimpleValueType(), TmpLdVal); - TempProxyRegOps.push_back(TmpLdVal); - - SDValue CMask = DAG.getConstant(255, dl, MergedType); - SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); - // Need to extend the i16 register to the whole width. - TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); - // Mask off the high bits. Leave only the lower 8bits. - // Do this because we are using loadparam.b8. - TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); - // Shift and merge - TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); - RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); - } - if (ElementType != MergedType) - RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); - - return RetVal; -} - static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func) { if (!Func) @@ -1480,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc dl = CLI.DL; - SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; + const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Callee = CLI.Callee; - bool &isTailCall = CLI.IsTailCall; ArgListTy &Args = CLI.getArgs(); Type *RetTy = CLI.RetTy; const CallBase *CB = CLI.CB; @@ -1493,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return DAG.getConstant(I, dl, MVT::i32); }; + const unsigned UniqueCallSite = GlobalUniqueCallSite++; + const SDValue CallChain = CLI.Chain; + const SDValue StartChain = + DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl); + SDValue DeclareGlue = StartChain.getValue(1); + + SmallVector<SDValue, 16> CallPrereqs{StartChain}; + + const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) { + // PTX ABI requires integral types to be at least 32 bits in size. FP16 is + // loaded/stored using i16, so it's handled here as well. + const unsigned SizeBits = promoteScalarArgumentSize(Size * 8); + SDValue Declare = + DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(SizeBits), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + + const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align, + unsigned Size) { + SDValue Declare = DAG.getNode( + NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + // Variadic arguments. 
// // Normally, for each argument, we declare a param scalar or a param @@ -1508,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // After all vararg is processed, 'VAOffset' holds the size of the // vararg byte array. + assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) && + "Non-VarArg function with extra arguments"); - SDValue VADeclareParam; // vararg byte array const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic - unsigned VAOffset = 0; // current offset in the param array + unsigned VAOffset = 0; // current offset in the param array - const unsigned UniqueCallSite = GlobalUniqueCallSite++; - SDValue TempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); - SDValue InGlue = Chain.getValue(1); + const SDValue VADeclareParam = + CLI.Args.size() > FirstVAArg + ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32), + Align(STI.getMaxRequiredAlignment()), 0) + : SDValue(); // Args.size() and Outs.size() need not match. // Outs.size() will be larger @@ -1577,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); - const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> { - if (IsVAArg) { - if (ArgI == FirstVAArg) { - VADeclareParam = DAG.getNode( - NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), - GetI32(0), InGlue}); - return VADeclareParam; - } - return std::nullopt; - } - if (IsByVal || shouldPassAsArray(Arg.Ty)) { - // declare .param .align <align> .b8 .param<n>[<size>]; - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(ArgAlign.value()), - GetI32(TypeSize), InGlue}); - } + const SDValue ArgDeclare = [&]() { + if (IsVAArg) + return VADeclareParam; + + if (IsByVal || shouldPassAsArray(Arg.Ty)) + return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize); + assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); - // declare .param .b<size> .param<n>; - - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - const unsigned PromotedSize = - (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) - ? 
promoteScalarArgumentSize(TypeSize * 8) - : TypeSize * 8; - - return DAG.getNode(NVPTXISD::DeclareScalarParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); + assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) && + "Only int and float types are supported as non-array arguments"); + + return MakeDeclareScalarParam(ParamSymbol, TypeSize); }(); - if (ArgDeclare) { - Chain = ArgDeclare->getValue(0); - InGlue = ArgDeclare->getValue(1); - } // PTX Interoperability Guide 3.3(A): [Integer] Values shorter // than 32-bits are sign extended or zero extended, depending on @@ -1623,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; const auto GetStoredValue = [&](const unsigned I, EVT EltVT, - const Align PartAlign) { - SDValue StVal; + const MaybeAlign PartAlign) { if (IsByVal) { SDValue Ptr = ArgOutVals[0]; auto MPI = refinePtrAS(Ptr, DAG, DL, *this); SDValue SrcAddr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I])); - StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign); - } else { - StVal = ArgOutVals[I]; - - auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType()); - if (PromotedVT != StVal.getValueType()) { - StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT, - StVal); - } + return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign); } + SDValue StVal = ArgOutVals[I]; + assert(promoteScalarIntegerPTX(StVal.getValueType()) == + StVal.getValueType() && + "OutVal type should always be legal"); - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = - DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - return StVal; + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT StoreVT = + ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); + + return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); }; const auto VectorInfo = @@ -1661,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned J = 0; for (const unsigned NumElts : VectorInfo) { const int CurOffset = Offsets[J]; - EVT EltVT = promoteScalarIntegerPTX(VTs[J]); - const Align PartAlign = commonAlignment(ArgAlign, CurOffset); - - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar store. In such cases, fall back to byte stores. - if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) { - - SDValue StVal = GetStoredValue(J, EltVT, PartAlign); - Chain = LowerUnalignedStoreParam(DAG, Chain, - CurOffset + (IsByVal ? VAOffset : 0), - EltVT, StVal, InGlue, ArgI, dl); - - // LowerUnalignedStoreParam took care of inserting the necessary nodes - // into the SDAG, so just move on to the next element. - J++; - continue; - } + const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); if (IsVAArg && !IsByVal) // Align each part of the variadic argument to their type. @@ -1685,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((IsVAArg || VAOffset == 0) && "VAOffset must be 0 for non-VA args"); - SmallVector<SDValue, 6> StoreOperands{ - Chain, GetI32(IsVAArg ? 
FirstVAArg : ArgI), - GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))}; - // Record the values to store. - for (const unsigned K : llvm::seq(NumElts)) - StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign)); - StoreOperands.push_back(InGlue); + const unsigned Offset = + (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset)); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); + const MaybeAlign CurrentAlign = ExtendIntegerParam + ? MaybeAlign(std::nullopt) + : commonAlignment(ArgAlign, Offset); + + SDValue Val; + if (NumElts == 1) { + Val = GetStoredValue(J, EltVT, CurrentAlign); + } else { + SmallVector<SDValue, 8> StoreVals; + for (const unsigned K : llvm::seq(NumElts)) { + SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign); + if (ValJ.getValueType().isVector()) + DAG.ExtractVectorElements(ValJ, StoreVals); + else + StoreVals.push_back(ValJ); + } + + EVT VT = EVT::getVectorVT( + *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size()); + Val = DAG.getBuildVector(VT, dl, StoreVals); } - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), PartAlign, - MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, Val, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); + CallPrereqs.push_back(StoreParam); // TODO: We may need to support vector types that can be passed // as scalars in variadic arguments. if (IsVAArg && !IsByVal) { assert(NumElts == 1 && "Vectorization is expected to be disabled for variadics."); + const EVT TheStoreType = ExtendIntegerParam ? 
MVT::i32 : EltVT; VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext())); } @@ -1733,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VAOffset += TypeSize; } - GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); - // Handle Result if (!Ins.empty()) { - const SDValue RetDeclare = [&]() { - const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); - const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); - if (shouldPassAsArray(RetTy)) { - const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(RetAlign.value()), - GetI32(ResultSize / 8), InGlue}); - } - const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize); - return DAG.getNode( - NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); - }(); - Chain = RetDeclare.getValue(0); - InGlue = RetDeclare.getValue(1); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const unsigned ResultSize = DL.getTypeAllocSize(RetTy); + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize); + } else { + MakeDeclareScalarParam(RetSymbol, ResultSize); + } } - const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); // Set the size of the vararg param byte array if the callee is a variadic // function and the variadic part is not empty. - if (HasVAArgs) { + if (VADeclareParam) { SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), VADeclareParam.getOperand(2), GetI32(VAOffset), @@ -1768,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VADeclareParam->getVTList(), DeclareParamOps); } + const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); // If the type of the callsite does not match that of the function, convert // the callsite to an indirect call. const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func); @@ -1797,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // instruction. // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. + const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); std::string Proto = getPrototype(DL, RetTy, Args, CLI.Outs, HasVAArgs ? 
std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); - Chain = DAG.getNode( - NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, - {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue}); - InGlue = Chain.getValue(1); + const SDValue PrototypeDeclare = DAG.getNode( + NVPTXISD::CallPrototype, dl, MVT::Other, + {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)}); + CallPrereqs.push_back(PrototypeDeclare); } if (ConvertToIndirectCall) { @@ -1823,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const unsigned NumArgs = std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size()); /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) - Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), - GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, - GetI32(Proto), InGlue}); - InGlue = Chain.getValue(1); - + /// NumParams, Callee, Proto) + const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs); + const SDValue Call = DAG.getNode( + NVPTXISD::CALL, dl, MVT::Other, + {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), + GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)}); + + SmallVector<SDValue, 16> LoadChains{Call}; SmallVector<SDValue, 16> ProxyRegOps; - // An item of the vector is filled if the element does not need a ProxyReg - // operation on it and should be added to InVals as is. ProxyRegOps and - // ProxyRegTruncates contain empty/none items at the same index. - SmallVector<SDValue, 16> RetElts; - // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` - // to use the values of `LoadParam`s and to be replaced later then - // `CALLSEQ_END` is added. - SmallVector<SDValue, 16> TempProxyRegOps; - - // Generate loads from param memory/moves from registers for result if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; @@ -1857,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned VectorizedSize : VectorInfo) { - EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]); - EVT EltType = Ins[I].VT; - const Align EltAlign = commonAlignment(RetAlign, Offsets[I]); - - if (TheLoadType != VTs[I]) - EltType = TheLoadType; - - if (ExtendIntegerRetVal) { - TheLoadType = MVT::i32; - EltType = MVT::i32; - } else if (TheLoadType.getSizeInBits() < 16) { - EltType = MVT::i16; - } + for (const unsigned NumElts : VectorInfo) { + const MaybeAlign CurrentAlign = + ExtendIntegerRetVal ? MaybeAlign(std::nullopt) + : commonAlignment(RetAlign, Offsets[I]); - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar load. In such cases, fall back to byte loads. - if (VectorizedSize == 1 && RetTy->isAggregateType() && - EltAlign < DAG.getEVTAlign(TheLoadType)) { - SDValue Ret = LowerUnalignedLoadRetParam( - DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl); - ProxyRegOps.push_back(SDValue()); - RetElts.resize(I); - RetElts.push_back(Ret); - - I++; - continue; - } + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT LoadVT = + ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? 
MVT::i8 : VTI); - SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType); - LoadVTs.append({MVT::Other, MVT::Glue}); + const unsigned PackingAmt = + LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - NVPTXISD::NodeType Op; - switch (VectorizedSize) { - case 1: - Op = NVPTXISD::LoadParam; - break; - case 2: - Op = NVPTXISD::LoadParamV2; - break; - case 4: - Op = NVPTXISD::LoadParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + const EVT VecVT = NumElts == 1 ? LoadVT + : EVT::getVectorVT(*DAG.getContext(), + LoadVT.getScalarType(), + NumElts * PackingAmt); - SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue}; - SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); - for (const unsigned J : llvm::seq(VectorizedSize)) { - ProxyRegOps.push_back(RetVal.getValue(J)); - } + SDValue R = + DAG.getLoad(VecVT, dl, Call, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); - Chain = RetVal.getValue(VectorizedSize); - InGlue = RetVal.getValue(VectorizedSize + 1); + LoadChains.push_back(R.getValue(1)); - I += VectorizedSize; + if (NumElts == 1) + ProxyRegOps.push_back(R); + else + for (const unsigned J : llvm::seq(NumElts)) { + SDValue Elt = DAG.getNode( + LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT, + dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl)); + ProxyRegOps.push_back(Elt); + } + I += NumElts; } } - Chain = - DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); - InGlue = Chain.getValue(1); + const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains); + const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite, + UniqueCallSite + 1, SDValue(), dl); // Append ProxyReg instructions to the chain to make sure that `callseq_end` // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. 
- for (const unsigned I : llvm::seq(ProxyRegOps.size())) { - if (I < RetElts.size() && RetElts[I]) { - InVals.push_back(RetElts[I]); - continue; - } - - SDValue Ret = - DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(), - {Chain, ProxyRegOps[I]}); - - const EVT ExpectedVT = Ins[I].VT; - if (!Ret.getValueType().bitsEq(ExpectedVT)) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret); - } + for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) { + SDValue Proxy = + DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg}); + SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl); InVals.push_back(Ret); } - for (SDValue &T : TempProxyRegOps) { - SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(), - {Chain, T.getOperand(0)}); - DAG.ReplaceAllUsesWith(T, Repl); - DAG.RemoveDeadNode(T.getNode()); - } - - // set isTailCall to false for now, until we figure out how to express + // set IsTailCall to false for now, until we figure out how to express // tail call optimization in PTX - isTailCall = false; - return Chain; + CLI.IsTailCall = false; + return CallEnd; } SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, @@ -5114,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Operands.push_back(DCI.DAG.getIntPtrConstant( cast<LoadSDNode>(LD)->getExtensionType(), DL)); break; - case NVPTXISD::LoadParamV2: - OldNumOutputs = 2; - Opcode = NVPTXISD::LoadParamV4; - break; case NVPTXISD::LoadV2: OldNumOutputs = 2; Opcode = NVPTXISD::LoadV4; @@ -5198,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV2; break; - case NVPTXISD::StoreParam: - Opcode = NVPTXISD::StoreParamV2; - break; - case NVPTXISD::StoreParamV2: - Opcode = NVPTXISD::StoreParamV4; - break; case NVPTXISD::StoreV2: MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV4; @@ -5215,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, return SDValue(); Opcode = NVPTXISD::StoreV8; break; - case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands return SDValue(); @@ -5260,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT, ST->getMemOperand()); } -static SDValue PerformStoreCombineHelper(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - unsigned Front, unsigned Back) { - if (all_of(N->ops().drop_front(Front).drop_back(Back), - [](const SDUse &U) { return U.get()->isUndef(); })) - // Operand 0 is the previous value in the chain. Cannot return EntryToken - // as the previous value will become unused and eliminated later. - return N->getOperand(0); - - return combinePackingMovIntoStore(N, DCI, Front, Back); -} - static SDValue PerformStoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return combinePackingMovIntoStore(N, DCI, 1, 2); } -static SDValue PerformStoreParamCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // Operands from the 3rd to the 2nd last one are the values to be stored. - // {Chain, ArgID, Offset, Val, Glue} - return PerformStoreCombineHelper(N, DCI, 3, 1); -} - /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
/// static SDValue PerformADDCombine(SDNode *N, @@ -5429,6 +5219,42 @@ static SDValue PerformREMCombine(SDNode *N, return SDValue(); } +// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y) +static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.hasOneUse()) + return SDValue(); + EVT ToVT = N->getValueType(0); + EVT FromVT = Op.getValueType(); + if (!((ToVT == MVT::i32 && FromVT == MVT::i16) || + (ToVT == MVT::i64 && FromVT == MVT::i32))) + return SDValue(); + if (!(Op.getOpcode() == ISD::MUL || + (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1))))) + return SDValue(); + + SDLoc DL(N); + unsigned ExtOpcode = N->getOpcode(); + unsigned Opcode = 0; + if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_SIGNED; + else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap()) + Opcode = NVPTXISD::MUL_WIDE_UNSIGNED; + else + return SDValue(); + SDValue RHS = Op.getOperand(1); + if (Op.getOpcode() == ISD::SHL) { + const auto ShiftAmt = Op.getConstantOperandVal(1); + const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, DL, ToVT); + } + return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -5939,6 +5765,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, N->getConstantOperandAPInt(2), N->getConstantOperandVal(3)), SDLoc(N), N->getValueType(0)); + return SDValue(); +} + +// During call lowering we wrap the return values in a ProxyReg node which +// depends on the chain value produced by the completed call. This ensures that +// the full call is emitted in cases where libcalls are used to legalize +// operations. To improve the functioning of other DAG combines, we pull all +// operations we can through one of these nodes, ensuring that the ProxyReg +// directly wraps a load.
That is: +// +// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0))) +// +static SDValue sinkProxyReg(SDValue R, SDValue Chain, + TargetLowering::DAGCombinerInfo &DCI) { + switch (R.getOpcode()) { + case ISD::TRUNCATE: + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::BITCAST: { + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V); + return SDValue(); + } + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: + case ISD::OR: { + if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI)) + if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI)) + return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B); + return SDValue(); + } + case ISD::Constant: + return R; + case ISD::LOAD: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: { + return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(), + {Chain, R}); + } + case ISD::BUILD_VECTOR: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SmallVector<SDValue, 16> Ops; + for (auto &Op : R->ops()) { + SDValue V = sinkProxyReg(Op, Chain, DCI); + if (!V) + return SDValue(); + Ops.push_back(V); + } + return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops); + } + case ISD::EXTRACT_VECTOR_ELT: { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI)) + return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R), + R.getValueType(), V, R.getOperand(1)); + return SDValue(); + } + default: + return SDValue(); + } +} + +static SDValue combineProxyReg(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + // If the ProxyReg is not wrapping a load, try to pull the operations through + // the ProxyReg. 
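+  // For example (an illustrative note, not part of the original patch):
+  // (ProxyReg (trunc (LoadV2 retval0))) is rewritten by sinkProxyReg into
+  // (trunc (ProxyReg (LoadV2 retval0))), while constant operands are simply
+  // left in place rather than wrapped.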
+ if (Reg.getOpcode() != ISD::LOAD) { + if (SDValue V = sinkProxyReg(Reg, Chain, DCI)) + return V; + } return SDValue(); } @@ -5955,6 +5861,9 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return combineADDRSPACECAST(N, DCI); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return combineMulWide(N, DCI, OptLevel); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: @@ -5962,7 +5871,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); case ISD::LOAD: - case NVPTXISD::LoadParamV2: case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: return combineUnpackingMovIntoLoad(N, DCI); @@ -5970,6 +5878,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformMULCombine(N, DCI, OptLevel); case NVPTXISD::PRMT: return combinePRMT(N, DCI, OptLevel); + case NVPTXISD::ProxyReg: + return combineProxyReg(N, DCI); case ISD::SETCC: return PerformSETCCCombine(N, DCI, STI.getSmVersion()); case ISD::SHL: @@ -5977,10 +5887,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SREM: case ISD::UREM: return PerformREMCombine(N, DCI, OptLevel); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); case ISD::STORE: case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -6329,6 +6235,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, Results.push_back(NewValue.getValue(3)); } +static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType()); + + SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT); + SDValue NewProxy = + DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg}); + SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0)); + + Results.push_back(Res); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6346,6 +6268,9 @@ void NVPTXTargetLowering::ReplaceNodeResults( case ISD::CopyFromReg: ReplaceCopyFromReg_128(N, DAG, Results); return; + case NVPTXISD::ProxyReg: + replaceProxyReg(N, DAG, *this, Results); + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 228e2aa..cf72a1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -38,7 +38,7 @@ enum NodeType : unsigned { /// This node represents a PTX call instruction. 
It's operands are as follows: /// /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) CALL, MoveParam, @@ -84,13 +84,7 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LoadParam, - LoadParamV2, - LoadParamV4, - StoreParam, - StoreParamV2, - StoreParamV4, - LAST_MEMORY_OPCODE = StoreParamV4, + LAST_MEMORY_OPCODE = StoreV8, }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b5df4c6..6000b40 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -125,8 +125,6 @@ def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; def doRsqrtOpt : Predicate<"doRsqrtOpt()">; -def doMulWide : Predicate<"doMulWide">; - def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; @@ -836,36 +834,28 @@ def MULWIDES64 : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.s32">; def MULWIDES64Imm : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.s32">; -def MULWIDES64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.s32">; def MULWIDEU64 : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, B32:$b), "mul.wide.u32">; def MULWIDEU64Imm : BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i32imm:$b), "mul.wide.u32">; -def MULWIDEU64Imm64 : - BasicNVPTXInst<(outs B64:$dst), (ins B32:$a, i64imm:$b), "mul.wide.u32">; def MULWIDES32 : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.s16">; def MULWIDES32Imm : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.s16">; -def MULWIDES32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.s16">; def MULWIDEU32 : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b), "mul.wide.u16">; def MULWIDEU32Imm : BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i16imm:$b), "mul.wide.u16">; -def MULWIDEU32Imm32 : - BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, i32imm:$b), "mul.wide.u16">; -def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; -def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; -def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; +def SDTMulWide : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; +def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide, [SDNPCommutative]>; +def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide, [SDNPCommutative]>; // Matchers for signed, unsigned mul.wide ISD nodes. -let Predicates = [doMulWide] in { +let Predicates = [hasOptEnabled] in { def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>; def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>; def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>; @@ -877,85 +867,6 @@ let Predicates = [doMulWide] in { def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>; } -// Predicates used for converting some patterns to mul.wide. 
-def SInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(32); -}]>; - -def UInt32Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(32); -}]>; - -def SInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isSignedIntN(16); -}]>; - -def UInt16Const : PatLeaf<(imm), [{ - const APInt &v = N->getAPIntValue(); - return v.isIntN(16); -}]>; - -def IntConst_0_30 : PatLeaf<(imm), [{ - // Check if 0 <= v < 31; only then will the result of (x << v) be an int32. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(31); -}]>; - -def IntConst_0_14 : PatLeaf<(imm), [{ - // Check if 0 <= v < 15; only then will the result of (x << v) be an int16. - const APInt &v = N->getAPIntValue(); - return v.sge(0) && v.slt(15); -}]>; - -def SHL2MUL32 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(32, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32); -}]>; - -def SHL2MUL16 : SDNodeXForm<imm, [{ - const APInt &v = N->getAPIntValue(); - APInt temp(16, 1); - return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16); -}]>; - -// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide. -let Predicates = [doMulWide] in { - def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDES64Imm $a, (SHL2MUL32 $b))>; - def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)), - (MULWIDEU64Imm $a, (SHL2MUL32 $b))>; - - def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDES32Imm $a, (SHL2MUL16 $b))>; - def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)), - (MULWIDEU32Imm $a, (SHL2MUL16 $b))>; - - // Convert "sign/zero-extend then multiply" to mul.wide. - def : Pat<(mul (sext i32:$a), (sext i32:$b)), - (MULWIDES64 $a, $b)>; - def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>; - - def : Pat<(mul (zext i32:$a), (zext i32:$b)), - (MULWIDEU64 $a, $b)>; - def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>; - - def : Pat<(mul (sext i16:$a), (sext i16:$b)), - (MULWIDES32 $a, $b)>; - def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>; - - def : Pat<(mul (zext i16:$a), (zext i16:$b)), - (MULWIDEU32 $a, $b)>; - def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>; -} - // // Integer multiply-add // @@ -991,6 +902,39 @@ defm MAD32 : MAD<"mad.lo.s32", i32, B32, i32imm>; defm MAD64 : MAD<"mad.lo.s64", i64, B64, i64imm>; } +multiclass MAD_WIDE<string PtxSuffix, OneUse2 Op, RegTyInfo BigT, RegTyInfo SmallT> { + def rrr: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.RC:$b, BigT.RC:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), BigT.Ty:$c))]>; + def rri: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.RC:$b, BigT.Imm:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, SmallT.Ty:$b), imm:$c))]>; + def rir: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.RC:$c), + "mad.wide." # PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), BigT.Ty:$c))]>; + def rii: + BasicNVPTXInst<(outs BigT.RC:$dst), + (ins SmallT.RC:$a, SmallT.Imm:$b, BigT.Imm:$c), + "mad.wide." 
# PtxSuffix, + [(set BigT.Ty:$dst, (add (Op SmallT.Ty:$a, imm:$b), imm:$c))]>; +} + +def mul_wide_unsigned_oneuse : OneUse2<mul_wide_unsigned>; +def mul_wide_signed_oneuse : OneUse2<mul_wide_signed>; + +let Predicates = [hasOptEnabled] in { +defm MAD_WIDE_U16 : MAD_WIDE<"u16", mul_wide_unsigned_oneuse, I32RT, I16RT>; +defm MAD_WIDE_S16 : MAD_WIDE<"s16", mul_wide_signed_oneuse, I32RT, I16RT>; +defm MAD_WIDE_U32 : MAD_WIDE<"u32", mul_wide_unsigned_oneuse, I64RT, I32RT>; +defm MAD_WIDE_S32 : MAD_WIDE<"s32", mul_wide_signed_oneuse, I64RT, I32RT>; +} + foreach t = [I16RT, I32RT, I64RT] in { def NEG_S # t.Size : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), @@ -1234,7 +1178,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>; defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>; defm FMA_F64 : FMA<F64RT, allow_ftz = false>; -// sin/cos +// sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> : PatFrag<(ops node:$A), @@ -1250,6 +1194,10 @@ def COS_APPROX_f32 : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz), "cos.approx$ftz.f32", [(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>; +def TANH_APPROX_f32 : + BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32", + [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>, + Requires<[hasPTX<70>, hasSM<75>]>; //----------------------------------- // Bitwise operations @@ -1512,20 +1460,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN // Byte extraction via shift/trunc/sext -def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), - (CVT_s8_s32 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), +def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>; + +def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>; + +def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), - (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), + (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; + def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))), (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), - (BFE_S64rii $s, imm:$o, 8)>; -def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), - (CVT_s8_s64 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), - (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; //----------------------------------- // Comparison instructions (setp, set) @@ -1709,56 +1656,39 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; //----------------------------------- // Comparison and Selection //----------------------------------- +// TODO: These patterns seem very specific and brittle. We should try to find +// a more general solution. def cond_signed : PatLeaf<(cond), [{ return isSignedIntSetCC(N->get()); }]>; -def cond_not_signed : PatLeaf<(cond), [{ - return !isSignedIntSetCC(N->get()); -}]>; +// A 16-bit signed comparison of sign-extended byte extracts can be converted +// to 32-bit comparison if we change the PRMT to sign-extend the extracted +// bytes. 
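The claim in the comment above — that once both extracted bytes are sign-extended, a signed compare gives the same answer at i16 and i32 width — can be checked exhaustively. A small standalone C++ sketch (illustrative only, not part of the patch) before the pattern itself:

    #include <cassert>
    #include <cstdint>

    // For sign-extended bytes, signed comparison is width-independent; this is
    // what allows the i16 setcc to be rewritten as a 32-bit SETP on PRMT results.
    int main() {
      for (int a = -128; a <= 127; ++a)
        for (int b = -128; b <= 127; ++b)
          assert((static_cast<int16_t>(a) < static_cast<int16_t>(b)) ==
                 (static_cast<int32_t>(a) < static_cast<int32_t>(b)));
      return 0;
    }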
+def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), + (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), + cond_signed:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE), + (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE), + (cond2cc $cc))>; + +// A 16-bit comparison of truncated byte extracts can be be converted to 32-bit +// comparison because we know that the truncate is just trancating off zeros +// and that the most-significant byte is also zeros so the meaning of signed and +// unsigned comparisons will not be changed. +def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), + (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), + cond:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -// comparisons of i8 extracted with PRMT as i32 -// It's faster to do comparison directly on i32 extracted by PRMT, -// instead of the long conversion and sign extending. -def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)), - (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), - (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_not_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDTDeclareScalarParam : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; -def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -1770,104 +1700,20 @@ def declare_array_param : def declare_scalar_param : SDNode<"NVPTXISD::DeclareScalarParam", 
SDTDeclareScalarParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -def LoadParam : - SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : - SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : - SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def StoreParam : - SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : - SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : - SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; def proxy_reg : SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) def SDTCallProfile : SDTypeProfile<0, 6, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<5, i32>]>; -def call : - SDNode<"NVPTXISD::CALL", SDTCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -let mayLoad = true in { - class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"), - []>; - - class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b), - !strconcat("ld.param.v2", opstr, - " \t{{$dst, $dst2}}, [retval0$b];"), []>; - - class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins Offseti32imm:$b), - !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"), - []>; -} - -let mayStore = true in { - - multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> { - foreach op = [IMMType, regclass] in - if !or(support_imm, !isa<NVPTXRegClass>(op)) then - def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") - : NVPTXInst<(outs), - (ins op:$val, i32imm:$a, Offseti32imm:$b), - "st.param" # opstr # " \t[param$a$b], $val;", - []>; - } - - multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, - i32imm:$a, Offseti32imm:$b), - "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};", - []>; - } - - multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - foreach op3 = [IMMType, regclass] in - foreach op4 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - # !if(!isa<NVPTXRegClass>(op3), "r", "i") - # !if(!isa<NVPTXRegClass>(op4), "r", "i") - - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, - i32imm:$a, Offseti32imm:$b), - "st.param.v4" # opstr # - " \t[param$a$b], {{$val1, $val2, 
$val3, $val4}};", - []>; - } -} +def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, -/// NumParams, Callee, Proto, InGlue) +/// NumParams, Callee, Proto) def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; } @@ -1904,43 +1750,6 @@ foreach is_convergent = [0, 1] in { (call_uni_inst $addr, imm:$rets, imm:$params)>; } -def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">; -def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">; -def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">; -def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">; -def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">; -def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">; -def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">; -def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">; -def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">; -def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">; -def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">; - -defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">; -defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">; -defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">; -defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">; - -defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>; -defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>; - -defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">; -defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">; -defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">; -defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">; - -defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">; -defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">; -defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">; - -defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">; -defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">; - -defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">; -defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">; - -defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">; - def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), ".param .align $align .b8 \t$a[$size];", []>; @@ -1953,6 +1762,18 @@ def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size), def : Pat<(declare_scalar_param externalsym:$a, imm:$size), (DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>; +// Call prototype wrapper, this is a dummy instruction that just prints it's +// operand which is string defining the prototype. 
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def CallPrototype : + SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def ProtoIdent : Operand<i32> { let PrintMethod = "printProtoIdent"; } +def CALL_PROTOTYPE : + NVPTXInst<(outs), (ins ProtoIdent:$ident), + "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; + + foreach t = [I32RT, I64RT] in { defvar inst_name = "MOV" # t.Size # "_PARAM"; def inst_name : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), "mov.b" # t.Size>; @@ -1972,6 +1793,32 @@ defm ProxyRegB16 : ProxyRegInst<"b16", B16>; defm ProxyRegB32 : ProxyRegInst<"b32", B32>; defm ProxyRegB64 : ProxyRegInst<"b64", B64>; + +// Callseq start and end + +// Note: these nodes are marked as SDNPMayStore and SDNPMayLoad because +// they define the scope in which the declared params may be used. Therefore +// we add these flags to ensure ld.param and st.param are not sunk or hoisted +// out of that scope. + +def callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore, SDNPMayLoad]>; + +def Callseq_Start : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\{ // callseq $amt1, $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def Callseq_End : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\} // callseq $amt1", + [(callseq_end timm:$amt1, timm:$amt2)]>; + // // Load / Store Handling // @@ -2515,26 +2362,6 @@ def : Pat<(brcond i32:$a, bb:$target), def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target), (CBranchOther $a, bb:$target)>; -// Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, - SDTCisVT<1, i32>]>; -def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPSideEffect]>; - -def Callseq_Start : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\{ // callseq $amt1, $amt2", - [(callseq_start timm:$amt1, timm:$amt2)]>; -def Callseq_End : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\} // callseq $amt1", - [(callseq_end timm:$amt1, timm:$amt2)]>; - // trap instruction def trapinst : BasicNVPTXInst<(outs), (ins), "trap", [(trap)]>, Requires<[noPTXASUnreachableBug]>; // Emit an `exit` as well to convey to ptxas that `trap` exits the CFG. 
@@ -2543,18 +2370,6 @@ def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[ // brkpt instruction def debugtrapinst : BasicNVPTXInst<(outs), (ins), "brkpt", [(debugtrap)]>; -// Call prototype wrapper -def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def CallPrototype : - SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def ProtoIdent : Operand<i32> { - let PrintMethod = "printProtoIdent"; -} -def CALL_PROTOTYPE : - NVPTXInst<(outs), (ins ProtoIdent:$ident), - "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; - def SDTDynAllocaOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 5779d4e..0e8828f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -243,8 +243,6 @@ public: createObjectTargetWriter() const override { return createPPCXCOFFObjectWriter(TT.isArch64Bit()); } - - std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; }; } // end anonymous namespace @@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const { return std::nullopt; } -std::optional<MCFixupKind> -XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const { - return StringSwitch<std::optional<MCFixupKind>>(Name) - .Case("R_REF", PPC::fixup_ppc_nofixup) - .Default(std::nullopt); -} - MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 9e8ee9f..df0c666 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -48,8 +48,7 @@ enum Fixups { /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer - /// register number. It can also be used to tie the ref symbol to prevent it - /// from being garbage collected on AIX. + /// register number. 
fixup_ppc_nofixup, /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 8baf866..1af2f9c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -220,8 +220,6 @@ bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, return evaluateAsRelocatable(Expr, Res, Asm); } -void PPCXCOFFMCAsmInfo::anchor() {} - PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle) report_fatal_error("XCOFF is not supported for little-endian targets"); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 0f945b3..6af1bd7 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -33,8 +33,6 @@ public: }; class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF { - void anchor() override; - public: explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &); void printSpecifierExpr(raw_ostream &OS, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 54497d9..3dad0e8 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -213,7 +213,7 @@ public: void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override { if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) { MCSymbolXCOFF *TCSym = - cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly()) + static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly()) ->getQualNameSymbol(); // On AIX, we have TLS variable offsets (symbol@({gd|ie|le|ld}) depending // on the TLS access method (or model). 
For the general-dynamic access diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index f75ab62..a04f404 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( switch ((unsigned)Fixup.getKind()) { default: report_fatal_error("Unimplemented fixup kind."); + case XCOFF::RelocationType::R_REF: + return {XCOFF::RelocationType::R_REF, 0}; case PPC::fixup_ppc_half16: { const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15; switch (Specifier) { @@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25}; case PPC::fixup_ppc_br24abs: return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25}; - case PPC::fixup_ppc_nofixup: { - if (Specifier == PPC::S_None) - return {XCOFF::RelocationType::R_REF, 0}; - else - llvm_unreachable("Unsupported Modifier"); - } break; case FK_Data_4: case FK_Data_8: const uint8_t SignAndSizeForFKData = diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index a091b21..ce1d51a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2274,9 +2274,9 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV, void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { // Setup CurrentFnDescSym and its containing csect. - MCSectionXCOFF *FnDescSec = - cast<MCSectionXCOFF>(getObjFileLowering().getSectionForFunctionDescriptor( - &MF.getFunction(), TM)); + auto *FnDescSec = static_cast<MCSectionXCOFF *>( + getObjFileLowering().getSectionForFunctionDescriptor(&MF.getFunction(), + TM)); FnDescSec->setAlignment(Align(Subtarget->isPPC64() ? 8 : 4)); CurrentFnDescSym = FnDescSec->getQualNameSymbol(); @@ -2669,9 +2669,9 @@ void PPCAIXAsmPrinter::emitTracebackTable() { MCSymbol *EHInfoSym = TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF); MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym, TOCType_EHBlock); - const MCSymbol *TOCBaseSym = - cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) - ->getQualNameSymbol(); + const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>( + getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); const MCExpr *Exp = MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx), MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx); @@ -2788,7 +2788,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { } } - MCSectionXCOFF *Csect = cast<MCSectionXCOFF>( + auto *Csect = static_cast<MCSectionXCOFF *>( getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); // Switch to the containing csect. @@ -2869,9 +2869,9 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), PointerSize); // Emit TOC base address. - const MCSymbol *TOCBaseSym = - cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) - ->getQualNameSymbol(); + const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>( + getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); OutStreamer->emitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext), PointerSize); // Emit a null environment pointer. 
@@ -2996,10 +2996,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { Name += Prefix; Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName(); MCSymbol *S = OutContext.getOrCreateSymbol(Name); - TCEntry = cast<MCSectionXCOFF>( + TCEntry = static_cast<MCSectionXCOFF *>( getObjFileLowering().getSectionForTOCEntry(S, TM)); } else { - TCEntry = cast<MCSectionXCOFF>( + TCEntry = static_cast<MCSectionXCOFF *>( getObjFileLowering().getSectionForTOCEntry(I.first.first, TM)); } OutStreamer->switchSection(TCEntry); @@ -3054,7 +3054,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { return; SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM); - MCSectionXCOFF *Csect = cast<MCSectionXCOFF>( + auto *Csect = static_cast<MCSectionXCOFF *>( getObjFileLowering().SectionForGlobal(GO, GOKind, TM)); Align GOAlign = getGVAlignment(GO, GO->getDataLayout()); @@ -3316,9 +3316,9 @@ void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV, GlobalType = TOCType_GlobalExternal; MCSymbol *TypeInfoSym = TM.getSymbol(GV); MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym, GlobalType); - const MCSymbol *TOCBaseSym = - cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) - ->getQualNameSymbol(); + const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>( + getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); auto &Ctx = OutStreamer->getContext(); const MCExpr *Exp = MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx), diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index d295f35..1dc485d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2159,8 +2159,115 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; } -class XXEvalPattern <dag pattern, bits<8> imm> : - Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} +// ============================================================================= +// XXEVAL Instruction Pattern Definitions +// ============================================================================= +// +// XXEVAL instruction performs 256 different logical operations on three vector +// operands using an 8-bit immediate value to select the operation. 
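The 8-bit immediate is effectively the truth table of the selected three-input boolean function. A hypothetical helper (not in the patch) that derives it, consistent with the immediates used in this file — e.g. A?xor(B,C):and(B,C) gives 22 and xor(A, or(B,C)) gives 120:

    #include <cassert>
    #include <cstdint>

    // Bit (7 - ((a << 2) | (b << 1) | c)) of the immediate holds f(a, b, c).
    template <typename F> uint8_t xxevalImm(F f) {
      uint8_t Imm = 0;
      for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
          for (int c = 0; c < 2; ++c)
            if (f(a, b, c))
              Imm |= uint8_t(1u << (7 - ((a << 2) | (b << 1) | c)));
      return Imm;
    }

    int main() {
      assert(xxevalImm([](int a, int b, int c) { return a ? (b ^ c) : (b & c); }) == 22);
      assert(xxevalImm([](int a, int b, int c) { return a ^ (b | c); }) == 120);
      return 0;
    }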
+// Format: xxeval XT, XA, XB, XC, IMM +// For example: +// Equivalent function A?xor(B,C):and(B,C) is performed by +// xxeval XT, XA, XB, XC, 22 +// +// REGISTER CLASS CONSTRAINTS: +// - XXEVAL natively supports: VSRC register class [v4i32, v4f32, v2f64, v2i64] +// - Other vector types [v16i8, v8i16] require COPY_TO_REGCLASS to/from VRRC +// ============================================================================= + +class XXEvalPattern<dag pattern, bits<8> imm> + : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} + +class XXEvalPatterns<ValueType Vt, dag InputPattern, bits<8> Imm> + : Pat<(Vt InputPattern), + !if(!or(!eq(Vt, v4i32), !eq(Vt, v2i64)), + // VSRC path: direct XXEVAL for v4i32 and v2i64 + (XXEVAL $vA, $vB, $vC, Imm), + // VRRC path: wrap with COPY_TO_REGCLASS for other types + (COPY_TO_REGCLASS(XXEVAL(COPY_TO_REGCLASS Vt:$vA, VSRC), + (COPY_TO_REGCLASS Vt:$vB, VSRC), + (COPY_TO_REGCLASS Vt:$vC, VSRC), Imm), + VRRC))> {} + +// ============================================================================= +// PatFrags for Bitcast-Aware Vector bitwise Operations +// +// Each PatFrags defines TWO alternatives for pattern matcher to choose: +// - Direct operation (for v4i32) +// - Bitcast operation (for other types: v2i64, v16i8, v8i16) +// ============================================================================= + +// Basic Binary Operations +def VAnd + : PatFrags<(ops node:$a, node:$b), [(and node:$a, node:$b), + (bitconvert(and + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VXor + : PatFrags<(ops node:$a, node:$b), [(xor node:$a, node:$b), + (bitconvert(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VOr : PatFrags<(ops node:$a, node:$b), [(or node:$a, node:$b), + (bitconvert(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b))))]>; + +def VNot + : PatFrags<(ops node:$a), [(vnot node:$a), + (bitconvert(vnot(v4i32(bitconvert node:$a))))]>; + +// Derived bitwise operations +// Vector NOR operation (not(or)) +def VNor + : PatFrags<(ops node:$a, node:$b), [(vnot(or node:$a, node:$b)), + (bitconvert(vnot(or + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// Vector EQV operation (not(xor)) +def VEqv + : PatFrags<(ops node:$a, node:$b), [(vnot(xor node:$a, node:$b)), + (bitconvert(vnot(xor + (v4i32(bitconvert node:$a)), + (v4i32(bitconvert node:$b)))))]>; + +// ============================================================================= +// XXEVAL Ternary Pattern Multiclass: XXEvalTernarySelectAnd +// This class matches the equivalent Ternary Operation: A ? f(B,C) : AND(B,C) +// and emit the corresponding xxeval instruction with the imm value. +// +// The patterns implement xxeval vector select operations where: +// - A is the selector vector +// - f(B,C) is the "true" case op on vectors B and C (XOR, NOR, EQV, or NOT) +// - AND(B,C) is the "false" case op on vectors B and C +// ============================================================================= +multiclass XXEvalTernarySelectAnd<ValueType Vt> { + // Pattern: A ? XOR(B,C) : AND(B,C) XXEVAL immediate value: 22 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VXor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 22>; + + // Pattern: A ? NOR(B,C) : AND(B,C) XXEVAL immediate value: 24 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNor Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 24>; + + // Pattern: A ? 
EQV(B,C) : AND(B,C) XXEVAL immediate value: 25 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VEqv Vt:$vB, Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), + 25>; + + // Pattern: A ? NOT(C) : AND(B,C) XXEVAL immediate value: 26 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vC), (VAnd Vt:$vB, Vt:$vC)), 26>; + + // Pattern: A ? NOT(B) : AND(B,C) XXEVAL immediate value: 28 + def : XXEvalPatterns< + Vt, (vselect Vt:$vA, (VNot Vt:$vB), (VAnd Vt:$vB, Vt:$vC)), 28>; +} let Predicates = [PrefixInstrs, HasP10Vector] in { let AddedComplexity = 400 in { @@ -2270,6 +2377,11 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // (xor A, (or B, C)) def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + // XXEval Patterns for ternary Operations. + foreach Ty = [v4i32, v2i64, v8i16, v16i8] in { + defm : XXEvalTernarySelectAnd<Ty>; + } + // Anonymous patterns to select prefixed VSX loads and stores. // Load / Store f128 def : Pat<(f128 (load PDForm:$src)), diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index a143d85..d71c42c 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -3849,9 +3849,14 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, switch (Inst.getOpcode()) { default: break; - case RISCV::PseudoC_ADDI_NOP: - emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP)); + case RISCV::PseudoC_ADDI_NOP: { + if (Inst.getOperand(2).getImm() == 0) + emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP)); + else + emitToStreamer( + Out, MCInstBuilder(RISCV::C_NOP_HINT).addOperand(Inst.getOperand(2))); return false; + } case RISCV::PseudoLLAImm: case RISCV::PseudoLAImm: case RISCV::PseudoLI: { diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index fa7bcfa..67cc01e 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -193,21 +193,19 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo, static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { - if (RegNo == 0) { + if (RegNo == 0) return MCDisassembler::Fail; - } return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus -DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint32_t Address, - const MCDisassembler *Decoder) { - if (RegNo == 2) { +static DecodeStatus DecodeGPRNoX2RegisterClass(MCInst &Inst, uint64_t RegNo, + uint32_t Address, + const MCDisassembler *Decoder) { + if (RegNo == 2) return MCDisassembler::Fail; - } - return DecodeGPRNoX0RegisterClass(Inst, RegNo, Address, Decoder); + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } static DecodeStatus DecodeGPRNoX31RegisterClass(MCInst &Inst, uint32_t RegNo, @@ -536,41 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst 
&Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, + const MCDisassembler *Decoder) { + if (Imm < RISCVZC::RA_S0) + return MCDisassembler::Fail; + return decodeZcmpRlist(Inst, Imm, Address, Decoder); +} + +static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, + uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, @@ -579,18 +562,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, #include "RISCVGenDisassemblerTables.inc" -static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - if (!Check(S, DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - Inst.addOperand(Inst.getOperand(0)); - Inst.addOperand(MCOperand::createImm(0)); - return S; -} - static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -601,66 +572,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, return MCDisassembler::Success; } -static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createReg(RISCV::X0)); - uint32_t Imm = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); - [[maybe_unused]] DecodeStatus Result = - decodeSImmOperand<6>(Inst, Imm, Address, Decoder); - assert(Result == MCDisassembler::Success && "Invalid immediate"); - return MCDisassembler::Success; -} - -static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createReg(RISCV::X0)); - uint32_t Imm = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); - return decodeCLUIImmOperand(Inst, Imm, Address, Decoder); -} - -static DecodeStatus -decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createReg(RISCV::X0)); - Inst.addOperand(Inst.getOperand(0)); - - uint32_t UImm6 = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); - return decodeUImmLog2XLenNonZeroOperand(Inst, UImm6, Address, Decoder); -} - -static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - 
uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder))) - return MCDisassembler::Fail; - return S; -} - -static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - Inst.addOperand(Inst.getOperand(0)); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder))) - return MCDisassembler::Fail; - return S; -} - static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -691,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, return S; } -static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); - if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Imm < RISCVZC::RA_S0) - return MCDisassembler::Fail; - return decodeZcmpRlist(Inst, Imm, Address, Decoder); -} - // Add implied SP operand for C.*SP compressed instructions. The SP operand // isn't explicitly encoded in the instruction. void RISCVDisassembler::addSPOperands(MCInst &MI) const { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 2c37c3b..82e3b5c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -320,6 +320,7 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup Fixup = MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN); F.setVarFixups({Fixup}); + F.setLinkerRelaxable(); F.getParent()->setLinkerRelaxable(); return true; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7ad5d5f..bddea43 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -330,7 +330,6 @@ enum OperandType : unsigned { OPERAND_UIMM32, OPERAND_UIMM48, OPERAND_UIMM64, - OPERAND_ZERO, OPERAND_THREE, OPERAND_FOUR, OPERAND_SIMM5, diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index d4f5d8f..2f32e2a 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -293,7 +293,7 @@ void RISCVAsmPrinter::emitNTLHint(const MachineInstr *MI) { MCInst Hint; if (STI->hasStdExtZca()) - Hint.setOpcode(RISCV::C_ADD_HINT); + Hint.setOpcode(RISCV::C_ADD); else Hint.setOpcode(RISCV::ADD); diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td index 4c303a9..da6b95d 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.td +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td @@ -95,3 +95,7 @@ def CSR_XLEN_F32_V_Interrupt_RVE: CalleeSavedRegs<(sub 
CSR_XLEN_F32_V_Interrupt, // Same as CSR_XLEN_F64_V_Interrupt, but excluding X16-X31. def CSR_XLEN_F64_V_Interrupt_RVE: CalleeSavedRegs<(sub CSR_XLEN_F64_V_Interrupt, (sequence "X%u", 16, 31))>; + +def CSR_RT_MostRegs : CalleeSavedRegs<(sub CSR_Interrupt, X6, X7, X28)>; +def CSR_RT_MostRegs_RVE : CalleeSavedRegs<(sub CSR_RT_MostRegs, + (sequence "X%u", 16, 31))>; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b1ab76a..9fc0d81 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1581,7 +1581,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, // Set the register and all its subregisters. if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) { SavedRegs.set(CSReg); - llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); }); + for (unsigned Reg : SubRegs) + SavedRegs.set(Reg); } // Combine to super register if all of its subregisters are marked. diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 34910b7..f223fdbe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -634,7 +634,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 < C2 // -> (SignedBitfieldExtract X, msb, lsb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -750,7 +750,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { // Transform (sra (shl X, C1) C2) with C1 > C2 // -> (NDS.BFOS X, lsb, msb) if (N0.getOpcode() == ISD::SHL) { - auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + auto *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!N01C) return false; @@ -1191,7 +1191,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Optimize (shl (and X, C2), C) -> (slli (srliw X, C3), C3+C) // where C2 has 32 leading zeros and C3 trailing zeros. SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1210,7 +1210,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // - without Zba a tablegen pattern applies the very same // transform as we would have done here SDNode *SLLI = CurDAG->getMachineNode( - RISCV::SLLI, DL, VT, N0->getOperand(0), + RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LeadingZeros, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1239,7 +1239,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { unsigned TrailingZeros = llvm::countr_zero(Mask); if (LeadingZeros == 32 && TrailingZeros > ShAmt) { SDNode *SRLIW = CurDAG->getMachineNode( - RISCV::SRLIW, DL, VT, N0->getOperand(0), + RISCV::SRLIW, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(TrailingZeros, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), @@ -1266,7 +1266,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (TrailingOnes == 32) { SDNode *SRLI = CurDAG->getMachineNode( Subtarget->is64Bit() ? 
RISCV::SRLIW : RISCV::SRLI, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -1279,19 +1279,19 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (HasBitTest && ShAmt + 1 == TrailingOnes) { SDNode *BEXTI = CurDAG->getMachineNode( Subtarget->hasStdExtZbs() ? RISCV::BEXTI : RISCV::TH_TST, DL, VT, - N0->getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); + N0.getOperand(0), CurDAG->getTargetConstant(ShAmt, DL, VT)); ReplaceNode(Node, BEXTI); return; } const unsigned Msb = TrailingOnes - 1; const unsigned Lsb = ShAmt; - if (tryUnsignedBitfieldExtract(Node, DL, VT, N0->getOperand(0), Msb, Lsb)) + if (tryUnsignedBitfieldExtract(Node, DL, VT, N0.getOperand(0), Msb, Lsb)) return; unsigned LShAmt = Subtarget->getXLen() - TrailingOnes; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRLI = CurDAG->getMachineNode( RISCV::SRLI, DL, VT, SDValue(SLLI, 0), @@ -1328,7 +1328,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; unsigned LShAmt = Subtarget->getXLen() - ExtSize; SDNode *SLLI = - CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), CurDAG->getTargetConstant(LShAmt, DL, VT)); SDNode *SRAI = CurDAG->getMachineNode( RISCV::SRAI, DL, VT, SDValue(SLLI, 0), @@ -2942,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. + if (SelectAddrFrameIndex(Addr, Base, Offset)) + return true; SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2953,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only - // a 9-bit immediate can be folded. + if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base)) + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3918dd2..c0ada51 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1618,6 +1618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } + // Customize load and store operation for bf16 if zfh isn't enabled. + if (Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh()) { + setOperationAction(ISD::LOAD, MVT::bf16, Custom); + setOperationAction(ISD::STORE, MVT::bf16, Custom); + } + // Function alignments. const Align FunctionAlignment(Subtarget.hasStdExtZca() ? 
2 : 4); setMinFunctionAlignment(FunctionAlignment); @@ -7216,6 +7222,47 @@ static SDValue SplitStrictFPVectorOp(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues({V, HiRes.getValue(1)}, DL); } +SDValue +RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Load(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() && + "Unexpected bfloat16 load lowering"); + + SDLoc DL(Op); + LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); + EVT MemVT = LD->getMemoryVT(); + SDValue Load = DAG.getExtLoad( + ISD::ZEXTLOAD, DL, Subtarget.getXLenVT(), LD->getChain(), + LD->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + LD->getMemOperand()); + // Using mask to make bf16 nan-boxing valid when we don't have flh + // instruction. -65536 would be treat as a small number and thus it can be + // directly used lui to get the constant. + SDValue mask = DAG.getSignedConstant(-65536, DL, Subtarget.getXLenVT()); + SDValue OrSixteenOne = + DAG.getNode(ISD::OR, DL, Load.getValueType(), {Load, mask}); + SDValue ConvertedResult = + DAG.getNode(RISCVISD::NDS_FMV_BF16_X, DL, MVT::bf16, OrSixteenOne); + return DAG.getMergeValues({ConvertedResult, Load.getValue(1)}, DL); +} + +SDValue +RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Store(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() && + "Unexpected bfloat16 store lowering"); + + StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); + SDLoc DL(Op); + SDValue FMV = DAG.getNode(RISCVISD::NDS_FMV_X_ANYEXTBF16, DL, + Subtarget.getXLenVT(), ST->getValue()); + return DAG.getTruncStore( + ST->getChain(), DL, FMV, ST->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), ST->getMemoryVT().getSizeInBits()), + ST->getMemOperand()); +} + SDValue RISCVTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -7914,6 +7961,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getMergeValues({Pair, Chain}, DL); } + if (VT == MVT::bf16) + return lowerXAndesBfHCvtBFloat16Load(Op, DAG); + // Handle normal vector tuple load. if (VT.isRISCVVectorTuple()) { SDLoc DL(Op); @@ -7998,6 +8048,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, {Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64, Store->getMemOperand()); } + + if (VT == MVT::bf16) + return lowerXAndesBfHCvtBFloat16Store(Op, DAG); + // Handle normal vector tuple store. if (VT.isRISCVVectorTuple()) { SDLoc DL(Op); @@ -16079,7 +16133,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt = CNode->getZExtValue(); // Don't do this if the Xqciac extension is enabled and the MulAmt in simm12. 
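The change from isInt<12>(MulAmt) to isInt<12>(CNode->getSExtValue()) in the guard just below appears to be about negative multipliers of sub-64-bit types: MulAmt is taken from getZExtValue(), so a small negative constant shows up as a huge unsigned value that never looks like a simm12. A minimal sketch of the arithmetic (illustrative only; fitsSImm12 restates the isInt<12> check):

    #include <cassert>
    #include <cstdint>

    // Same range check as llvm::isInt<12>, restated for illustration.
    bool fitsSImm12(int64_t v) { return v >= -2048 && v <= 2047; }

    int main() {
      // Hypothetical i32 multiplier -5: the zero-extended value is huge,
      // but the sign-extended value is a perfectly good simm12.
      uint32_t zext = static_cast<uint32_t>(-5);   // 0xFFFFFFFB
      int64_t  sext = static_cast<int32_t>(zext);  // -5
      assert(!fitsSImm12(static_cast<int64_t>(zext)));
      assert(fitsSImm12(sext));
      return 0;
    }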
- if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt)) + if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); const bool HasShlAdd = Subtarget.hasStdExtZba() || @@ -16184,10 +16238,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x)) for (uint64_t Offset : {3, 5, 9}) { if (isPowerOf2_64(MulAmt + Offset)) { + unsigned ShAmt = Log2_64(MulAmt + Offset); + if (ShAmt >= VT.getSizeInBits()) + continue; SDLoc DL(N); SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT)); + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); @@ -20674,6 +20730,53 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getAllOnesConstant(DL, VT); return DAG.getConstant(0, DL, VT); } + case Intrinsic::riscv_vsseg2_mask: + case Intrinsic::riscv_vsseg3_mask: + case Intrinsic::riscv_vsseg4_mask: + case Intrinsic::riscv_vsseg5_mask: + case Intrinsic::riscv_vsseg6_mask: + case Intrinsic::riscv_vsseg7_mask: + case Intrinsic::riscv_vsseg8_mask: { + SDValue Tuple = N->getOperand(2); + unsigned NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + + if (Subtarget.hasOptimizedSegmentLoadStore(NF) || !Tuple.hasOneUse() || + Tuple.getOpcode() != RISCVISD::TUPLE_INSERT || + !Tuple.getOperand(0).isUndef()) + return SDValue(); + + SDValue Val = Tuple.getOperand(1); + unsigned Idx = Tuple.getConstantOperandVal(2); + + unsigned SEW = Val.getValueType().getScalarSizeInBits(); + assert(Log2_64(SEW) == N->getConstantOperandVal(6) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/N->getOperand(0), + /*IntID=*/ + DAG.getTargetConstant(Intrinsic::riscv_vsse_mask, DL, XLenVT), + /*StoredVal=*/Val, + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, N->getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/N->getOperand(4), + /*VL=*/N->getOperand(5)}; + + auto *OldMemSD = cast<MemIntrinsicSDNode>(N); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = OldMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + OldMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList(MVT::Other); + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, VTs, Ops, MemVT, + MMO); + } } } case ISD::EXPERIMENTAL_VP_REVERSE: @@ -20766,6 +20869,68 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } break; } + case RISCVISD::TUPLE_EXTRACT: { + EVT VT = N->getValueType(0); + SDValue Tuple = N->getOperand(0); + unsigned Idx = N->getConstantOperandVal(1); + if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN) + break; + + unsigned NF = 0; + switch (Tuple.getConstantOperandVal(1)) { + default: + break; + case Intrinsic::riscv_vlseg2_mask: + case Intrinsic::riscv_vlseg3_mask: + case Intrinsic::riscv_vlseg4_mask: + case Intrinsic::riscv_vlseg5_mask: + case Intrinsic::riscv_vlseg6_mask: + case Intrinsic::riscv_vlseg7_mask: + case Intrinsic::riscv_vlseg8_mask: + NF = Tuple.getValueType().getRISCVVectorTupleNumFields(); + break; + } + + if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF)) + break; + + unsigned SEW = VT.getScalarSizeInBits(); + assert(Log2_64(SEW) == 
Tuple.getConstantOperandVal(7) && + "Type mismatch without bitcast?"); + unsigned Stride = SEW / 8 * NF; + unsigned Offset = SEW / 8 * Idx; + + SDValue Ops[] = { + /*Chain=*/Tuple.getOperand(0), + /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT), + /*Passthru=*/Tuple.getOperand(2), + /*Ptr=*/ + DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3), + DAG.getConstant(Offset, DL, XLenVT)), + /*Stride=*/DAG.getConstant(Stride, DL, XLenVT), + /*Mask=*/Tuple.getOperand(4), + /*VL=*/Tuple.getOperand(5), + /*Policy=*/Tuple.getOperand(6)}; + + auto *TupleMemSD = cast<MemIntrinsicSDNode>(Tuple); + // Match getTgtMemIntrinsic for non-unit stride case + EVT MemVT = TupleMemSD->getMemoryVT().getScalarType(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize); + + SDVTList VTs = DAG.getVTList({VT, MVT::Other}); + SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, + Ops, MemVT, MMO); + DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1)); + return Result.getValue(0); + } + case RISCVISD::TUPLE_INSERT: { + // tuple_insert tuple, undef, idx -> tuple + if (N->getOperand(1).isUndef()) + return N->getOperand(0); + break; + } } return SDValue(); @@ -22290,6 +22455,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( case CallingConv::C: case CallingConv::Fast: case CallingConv::SPIR_KERNEL: + case CallingConv::PreserveMost: case CallingConv::GRAAL: case CallingConv::RISCV_VectorCall: #define CC_VLS_CASE(ABI_VLEN) case CallingConv::RISCV_VLSCall_##ABI_VLEN: @@ -22559,8 +22725,14 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); + const CallBase *CB = CLI.CB; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; + + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); // Analyze the operands of the call, assigning locations to each operand. 
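The Stride/Offset arithmetic in the vlseg/vsseg combines above reflects the interleaved in-memory layout of segment accesses: element i of field f lives at Base + (i * NF + f) * (SEW / 8), so touching a single field is a strided access with stride NF * SEW / 8 starting at offset Idx * SEW / 8. A small sketch under that assumption (segmentAddr is a hypothetical helper, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Byte address of element `i` of field `f` in a segment access with NF
    // fields of SEW-bit elements.
    uint64_t segmentAddr(uint64_t Base, unsigned NF, unsigned SEW, unsigned f,
                         unsigned i) {
      return Base + (uint64_t(i) * NF + f) * (SEW / 8);
    }

    int main() {
      const unsigned NF = 4, SEW = 32, Idx = 2;
      const uint64_t Stride = SEW / 8 * NF;  // 16 bytes between elements of one field
      const uint64_t Offset = SEW / 8 * Idx; // 8-byte offset of field 2
      for (unsigned i = 0; i < 8; ++i)
        assert(segmentAddr(/*Base=*/0, NF, SEW, Idx, i) == Offset + i * Stride);
      return 0;
    }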
SmallVector<CCValAssign, 16> ArgLocs; @@ -22818,6 +22990,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, if (CLI.CFIType) Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); + if (MF.getTarget().Options.EmitCallGraphSection && CB && + CB->isIndirectCall()) + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } @@ -22825,6 +23000,10 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); if (CLI.CFIType) Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); + + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); Glue = Chain.getValue(1); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f0447e0..ca70c46 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -578,6 +578,9 @@ private: SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXAndesBfHCvtBFloat16Load(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXAndesBfHCvtBFloat16Store(SDValue Op, SelectionDAG &DAG) const; + bool isEligibleForTailCallOptimization( CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, const SmallVector<CCValAssign, 16> &ArgLocs) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 64f9e3e..085064e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2859,9 +2859,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM16_NONZERO: Ok = isUInt<16>(Imm) && (Imm != 0); break; - case RISCVOp::OPERAND_ZERO: - Ok = Imm == 0; - break; case RISCVOp::OPERAND_THREE: Ok = Imm == 3; break; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 8252a9b..c5551fb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -57,12 +57,6 @@ def simm6nonzero : RISCVOp, }]; } -def immzero : RISCVOp, - ImmLeaf<XLenVT, [{return (Imm == 0);}]> { - let ParserMatchClass = ImmZeroAsmOperand; - let OperandType = "OPERAND_ZERO"; -} - def CLUIImmAsmOperand : AsmOperandClass { let Name = "CLUIImm"; let RenderMethod = "addImmOperands"; @@ -272,7 +266,7 @@ class Bcz<bits<3> funct3, string OpcodeStr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class Shift_right<bits<2> funct2, string OpcodeStr> : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), - (ins GPRC:$rs1, uimmlog2xlennonzero:$imm), + (ins GPRC:$rs1, uimmlog2xlen:$imm), OpcodeStr, "$rs1, $imm"> { let Constraints = "$rs1 = $rd"; let Inst{12} = imm{5}; @@ -402,17 +396,19 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">, let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, simm6nonzero:$imm), + (ins GPRNoX0:$rd, simm6:$imm), "c.addi", "$rd, $imm">, Sched<[WriteIALU, ReadIALU]> { let Constraints = "$rd = $rd_wb"; } -// Alternate syntax for c.nop. Converted to C_NOP by the assembler. +// Alternate syntax for c.nop. Converted to C_NOP/C_NOP_HINT by the assembler. 
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, isAsmParserOnly = 1 in -def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm), - [], "c.addi", "$rd, $imm">; +def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, simm6:$imm), + [], "c.addi", "$rd, $imm"> { + let Constraints = "$rs1 = $rd"; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, DecoderNamespace = "RV32Only", Defs = [X1], @@ -430,7 +426,7 @@ def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb), } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm), +def C_LI : RVInst16CI<0b010, 0b01, (outs GPR:$rd), (ins simm6:$imm), "c.li", "$rd, $imm">, Sched<[WriteIALU]>; @@ -449,7 +445,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb), } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd), +def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX2:$rd), (ins c_lui_imm:$imm), "c.lui", "$rd, $imm">, Sched<[WriteIALU]>; @@ -497,8 +493,8 @@ def C_BEQZ : Bcz<0b110, "c.beqz">, Sched<[WriteJmp, ReadJmp]>; def C_BNEZ : Bcz<0b111, "c.bnez">, Sched<[WriteJmp, ReadJmp]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm), +def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), + (ins GPR:$rd, uimmlog2xlen:$imm), "c.slli", "$rd, $imm">, Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; @@ -544,7 +540,7 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1), let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1, isAsCheapAsAMove = 1 in -def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2), +def C_MV : RVInst16CR<0b1000, 0b10, (outs GPR:$rs1), (ins GPRNoX0:$rs2), "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]>; @@ -557,8 +553,8 @@ def C_JALR : RVInst16CR<0b1001, 0b10, (outs), (ins GPRNoX0:$rs1), "c.jalr", "$rs1">, Sched<[WriteJalr, ReadJalr]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rd), - (ins GPRNoX0:$rs1, GPRNoX0:$rs2), +def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd), + (ins GPR:$rs1, GPRNoX0:$rs2), "c.add", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU, ReadIALU]> { let Constraints = "$rs1 = $rd"; @@ -616,81 +612,6 @@ def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm), let rd = 0; } -def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, immzero:$imm), - "c.addi", "$rd, $imm">, - Sched<[WriteIALU, ReadIALU]> { - let Constraints = "$rd = $rd_wb"; - let imm = 0; - let DecoderMethod = "decodeRVCInstrRdRs1ImmZero"; -} - -def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm), - "c.li", "$rd, $imm">, - Sched<[WriteIALU]> { - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdSImm6"; -} - -def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), - (ins c_lui_imm:$imm), - "c.lui", "$rd, $imm">, - Sched<[WriteIALU]> { - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdCLUIImm"; -} - -def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2), - "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]> { - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs2"; -} - -def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rd), - (ins GPRX0:$rs1, GPRNoX0:$rs2), - "c.add", "$rs1, 
$rs2">, - Sched<[WriteIALU, ReadIALU, ReadIALU]> { - let Constraints = "$rs1 = $rd"; - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs1Rs2"; -} - -def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), - (ins GPRX0:$rd, uimmlog2xlennonzero:$imm), - "c.slli", "$rd, $imm">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rd = $rd_wb"; - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs1UImmLog2XLenNonZero"; -} - -def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), - "c.slli64", "$rd">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rd = $rd_wb"; - let imm = 0; -} - -def C_SRLI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), - (ins GPRC:$rs1), - "c.srli64", "$rs1">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rs1 = $rd"; - let Inst{6-2} = 0; - let Inst{11-10} = 0b00; - let Inst{12} = 0; -} - -def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), - (ins GPRC:$rs1), - "c.srai64", "$rs1">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rs1 = $rd"; - let Inst{6-2} = 0; - let Inst{11-10} = 0b01; - let Inst{12} = 0; -} - } // Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0, // mayStore = 0 @@ -699,15 +620,17 @@ def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZca] in { -// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6. -def : InstAlias<"c.addi x0, $imm", (C_NOP_HINT simm6nonzero:$imm), 0>; +// Legacy aliases. +def : InstAlias<"c.slli64 $rd", (C_SLLI GPR:$rd, 0), 0>; +def : InstAlias<"c.srli64 $rs1", (C_SRLI GPRC:$rs1, 0), 0>; +def : InstAlias<"c.srai64 $rs1", (C_SRAI GPRC:$rs1, 0), 0>; } let Predicates = [HasStdExtC, HasStdExtZihintntl] in { -def : InstAlias<"c.ntl.p1", (C_ADD_HINT X0, X2)>; -def : InstAlias<"c.ntl.pall", (C_ADD_HINT X0, X3)>; -def : InstAlias<"c.ntl.s1", (C_ADD_HINT X0, X4)>; -def : InstAlias<"c.ntl.all", (C_ADD_HINT X0, X5)>; +def : InstAlias<"c.ntl.p1", (C_ADD X0, X2)>; +def : InstAlias<"c.ntl.pall", (C_ADD X0, X3)>; +def : InstAlias<"c.ntl.s1", (C_ADD X0, X4)>; +def : InstAlias<"c.ntl.all", (C_ADD X0, X5)>; } // Predicates = [HasStdExtC, HasStdExtZihintntl] let EmitPriority = 0 in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index dd365cf..8297d50 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> //===----------------------------------------------------------------------===// let Predicates = [HasStdExtP] in { +let IsSignExtendingOpW = 1 in def CLS : Unary_r<0b011000000011, 0b001, "cls">; def ABS : Unary_r<0b011000000111, 0b001, "abs">; } // Predicates = [HasStdExtP] @@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in { def REV16 : Unary_r<0b011010110000, 0b101, "rev16">; def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">; +let IsSignExtendingOpW = 1 in { def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">; def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; +} } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index dfa532a..03e6f43 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -788,7 +788,7 
@@ class VPseudoUSLoadNoMask<VReg RetClass, DAGOperand sewop = sew> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, - sewop:$sew, vec_policy:$policy), []>, + sewop:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -804,7 +804,7 @@ class VPseudoUSLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -821,7 +821,7 @@ class VPseudoUSLoadFFNoMask<VReg RetClass, int EEW> : RISCVVPseudo<(outs RetClass:$rd, GPR:$vl), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -837,7 +837,7 @@ class VPseudoUSLoadFFMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -854,7 +854,7 @@ class VPseudoSLoadNoMask<VReg RetClass, int EEW> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -870,7 +870,7 @@ class VPseudoSLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -892,7 +892,7 @@ class VPseudoILoadNoMask<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -914,7 +914,7 @@ class VPseudoILoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -933,7 +933,7 @@ class VPseudoUSStoreNoMask<VReg StClass, DAGOperand sewop = sew> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, - sewop:$sew), []>, + sewop:$sew)>, RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -946,7 +946,7 @@ class VPseudoUSStoreMask<VReg StClass, int EEW> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -960,7 +960,7 @@ class VPseudoSStoreNoMask<VReg StClass, int EEW> 
: RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, - AVL:$vl, sew:$sew), []>, + AVL:$vl, sew:$sew)>, RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -973,7 +973,7 @@ class VPseudoSStoreMask<VReg StClass, int EEW> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -986,7 +986,7 @@ class VPseudoSStoreMask<VReg StClass, class VPseudoNullaryNoMask<VReg RegClass> : RISCVVPseudo<(outs RegClass:$rd), (ins RegClass:$passthru, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1015,7 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> : // Nullary for pseudo instructions. They are expanded in // RISCVExpandPseudoInsts pass. class VPseudoNullaryPseudoM<string BaseInst> : - RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []> { + RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1031,7 +1031,7 @@ class VPseudoUnaryNoMask<DAGOperand RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, OpClass:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1047,7 +1047,7 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass, string Constraint = "", bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), - (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []> { + (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1063,7 +1063,7 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1084,7 +1084,7 @@ class VPseudoUnaryMask<VReg RetClass, DAGOperand sewop = sew> : RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, - VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []> { + VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1104,7 +1104,7 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1139,7 +1139,7 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass, class VPseudoUnaryNoMaskGPROut : RISCVVPseudo<(outs GPR:$rd), - (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []> { + (ins VR:$rs2, AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1149,7 +1149,7 @@ class VPseudoUnaryNoMaskGPROut : class VPseudoUnaryMaskGPROut : RISCVVPseudo<(outs GPR:$rd), - (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []> { + (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1163,7 +1163,7 @@ 
class VPseudoUnaryAnyMask<VReg RetClass, VReg Op1Class> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, - VR:$vm, AVL:$vl, sew:$sew), []> { + VR:$vm, AVL:$vl, sew:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1197,7 +1197,7 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1241,7 +1241,7 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass, (ins GetVRegNoV0<RetClass>.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, VMaskOp:$vm, vec_rm:$rm, AVL:$vl, - sew:$sew, vec_policy:$policy), []> { + sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1266,7 +1266,7 @@ class VPseudoTiedBinaryNoMask<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew, - vec_policy:$policy), []> { + vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1288,7 +1288,7 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass, (ins RetClass:$rs2, Op2Class:$rs1, vec_rm:$rm, AVL:$vl, sew:$sew, - vec_policy:$policy), []> { + vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1380,7 +1380,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass, Op1Class:$rs2, Op2Class:$rs1, VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1451,7 +1451,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass, Op2Class:$rs1, VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1480,7 +1480,7 @@ class VPseudoBinaryCarry<VReg RetClass, (ins Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, AVL:$vl, sew:$sew), (ins Op1Class:$rs2, Op2Class:$rs1, - AVL:$vl, sew:$sew)), []> { + AVL:$vl, sew:$sew))> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1498,7 +1498,7 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMV0:$carry, AVL:$vl, sew:$sew), []> { + VMV0:$carry, AVL:$vl, sew:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1510,21 +1510,6 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass, let VLMul = MInfo.value; } -class VPseudoTernaryNoMask<VReg RetClass, - RegisterClass Op1Class, - DAGOperand Op2Class, - string Constraint> : - RISCVVPseudo<(outs RetClass:$rd), - (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew), []> { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = !interleave([Constraint, "$rd = $rs3"], ","); - let HasVLOp = 1; - let HasSEWOp = 1; -} - class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, RegisterClass Op1Class, DAGOperand Op2Class, @@ -1532,7 +1517,7 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let 
mayStore = 0; let hasSideEffects = 0; @@ -1570,7 +1555,7 @@ class VPseudoUSSegLoadNoMask<VReg RetClass, bits<4> NF> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1587,7 +1572,7 @@ class VPseudoUSSegLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1605,7 +1590,7 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass, bits<4> NF> : RISCVVPseudo<(outs RetClass:$rd, GPR:$vl), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1622,7 +1607,7 @@ class VPseudoUSSegLoadFFMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1640,7 +1625,7 @@ class VPseudoSSegLoadNoMask<VReg RetClass, bits<4> NF> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1657,7 +1642,7 @@ class VPseudoSSegLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, VMaskOp:$vm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1679,7 +1664,7 @@ class VPseudoISegLoadNoMask<VReg RetClass, RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -1701,7 +1686,7 @@ class VPseudoISegLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -1735,7 +1720,7 @@ class VPseudoUSSegStoreMask<VReg ValClass, bits<4> NF> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1750,7 +1735,7 @@ class VPseudoSSegStoreNoMask<VReg ValClass, bits<4> NF> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset, - AVL:$vl, sew:$sew), []>, + AVL:$vl, sew:$sew)>, RISCVVSSEG<NF, 
/*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1764,7 +1749,7 @@ class VPseudoSSegStoreMask<VReg ValClass, bits<4> NF> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1782,7 +1767,7 @@ class VPseudoISegStoreNoMask<VReg ValClass, bit Ordered> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, - AVL:$vl, sew:$sew), []>, + AVL:$vl, sew:$sew)>, RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -1799,7 +1784,7 @@ class VPseudoISegStoreMask<VReg ValClass, bit Ordered> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -6703,7 +6688,7 @@ let Predicates = [HasVInstructions] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let HasSEWOp = 1, BaseInstr = VMV_X_S in def PseudoVMV_X_S: - RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>, + RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>, Sched<[WriteVMovXS, ReadVMovXS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $passthru" in @@ -6723,8 +6708,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach f = FPList in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in def "PseudoVFMV_" # f.FX # "_S" : - RISCVVPseudo<(outs f.fprclass:$rd), - (ins VR:$rs2, sew:$sew), []>, + RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>, Sched<[WriteVMovFS, ReadVMovFS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, Constraints = "$rd = $passthru" in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 1bb67f4..c75addd9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -11,6 +11,20 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// RISC-V specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDT_NDS_FMV_BF16_X + : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, XLenVT>]>; +def SDT_NDS_FMV_X_ANYEXTBF16 + : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, bf16>]>; + +def riscv_nds_fmv_bf16_x + : SDNode<"RISCVISD::NDS_FMV_BF16_X", SDT_NDS_FMV_BF16_X>; +def riscv_nds_fmv_x_anyextbf16 + : SDNode<"RISCVISD::NDS_FMV_X_ANYEXTBF16", SDT_NDS_FMV_X_ANYEXTBF16>; + +//===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// @@ -773,6 +787,25 @@ def : Pat<(bf16 (fpround FPR32:$rs)), (NDS_FCVT_BF16_S FPR32:$rs)>; } // Predicates = [HasVendorXAndesBFHCvt] +let isCodeGenOnly = 1 in { +def NDS_FMV_BF16_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR16, GPR, "fmv.w.x">, + Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>; +def NDS_FMV_X_BF16 : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR16, "fmv.x.w">, + Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; +} + +let Predicates = [HasVendorXAndesBFHCvt] in { +def : Pat<(riscv_nds_fmv_bf16_x GPR:$src), (NDS_FMV_BF16_X GPR:$src)>; +def : Pat<(riscv_nds_fmv_x_anyextbf16 (bf16 FPR16:$src)), + (NDS_FMV_X_BF16 (bf16 FPR16:$src))>; +} // Predicates = [HasVendorXAndesBFHCvt] + +// Use flh/fsh to load/store bf16 if zfh is enabled. +let Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] in { +def : LdPat<load, FLH, bf16>; +def : StPat<store, FSH, FPR16, bf16>; +} // Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] + let Predicates = [HasVendorXAndesVBFHCvt] in { defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16; defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index f391300..5265613 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1120,27 +1120,11 @@ let Predicates = [HasVendorXqcisync, IsRV32] in { def QC_C_SYNCWF : QCIRVInst16CBSYNC<0b100, "qc.c.syncwf">; def QC_C_SYNCWL : QCIRVInst16CBSYNC<0b101, "qc.c.syncwl">; - let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in - def QC_C_DELAY : RVInst16CI<0b000, 0b10, (outs), - (ins uimm5nonzero:$imm), - "qc.c.delay", "$imm"> { - let Inst{12} = 0; - let Inst{11-7} = 0; - let Inst{6-2} = imm{4-0}; - } + // qc.c.delay implemented as an alias, below } // Predicates = [HasVendorXqcisync, IsRV32] let Predicates = [HasVendorXqcisim, IsRV32] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { - def QC_PSYSCALLI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm10:$imm10), - "qc.psyscalli", "$imm10"> { - bits<10> imm10; - - let rs1 = 0; - let rd = 0; - let imm12 = {0b00, imm10}; - } - def QC_PPUTCI : RVInstI<0b010, OPC_OP_IMM, (outs), (ins uimm8:$imm8), "qc.pputci", "$imm8"> { bits<8> imm8; @@ -1150,18 +1134,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { let imm12 = {0b0100, imm8}; } - def QC_PCOREDUMP : QCISim_NONE<0b0110, "qc.pcoredump">; - def QC_PPREGS : QCISim_NONE<0b0111, "qc.ppregs">; - def QC_PPREG : QCISim_RS1<0b1000, "qc.ppreg">; - def QC_PPUTC : QCISim_RS1<0b1001, "qc.pputc">; - def QC_PPUTS : QCISim_RS1<0b1010, "qc.pputs">; - def QC_PEXIT : QCISim_RS1<0b1011, "qc.pexit">; - def QC_PSYSCALL : QCISim_RS1<0b1100, "qc.psyscall">; - - def QC_C_PTRACE : RVInst16CI<0b000, 0b10, (outs), (ins), "qc.c.ptrace", ""> { - let rd = 0; - let imm = 0; - } + // The other instructions are all implemented as aliases, below } // mayLoad = 0, mayStore = 0, hasSideEffects = 1 } // Predicates = [HasVendorXqcisim, IsRV32] @@ -1218,6 +1191,27 @@ let EmitPriority = 0 in { } // EmitPriority = 0 } // Predicates = [HasVendorXqcilo, IsRV32] +let Predicates = [HasVendorXqcisim, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.ptrace", (C_SLLI X0, 0)>; + + def : InstAlias<"qc.psyscalli $imm", (SLTI X0, X0, uimm10:$imm)>; + def : InstAlias<"qc.pcoredump", (SLTI X0, X0, 1536)>; + def : InstAlias<"qc.ppregs", (SLTI X0, X0, 1792)>; + def : InstAlias<"qc.ppreg $rs1", (SLTI X0, GPR:$rs1, -2048)>; + 
def : InstAlias<"qc.pputc $rs1", (SLTI X0, GPR:$rs1, -1792)>; + def : InstAlias<"qc.pputs $rs1", (SLTI X0, GPR:$rs1, -1536)>; + def : InstAlias<"qc.pexit $rs1", (SLTI X0, GPR:$rs1, -1280)>; + def : InstAlias<"qc.psyscall $rs1", (SLTI X0, GPR:$rs1, -1024)>; +} // EmitPriority = 1 +} // Predicates = [HasVendorXqcisim, IsRV32] + +let Predicates = [HasVendorXqcisync, IsRV32] in { +let EmitPriority = 1 in { + def : InstAlias<"qc.c.delay $imm", (C_SLLI X0, uimm5nonzero:$imm)>; +} +} // Predicates = [HasVendorXqcisync, IsRV32] + //===----------------------------------------------------------------------===// // Pseudo-instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index f173440..ed1a60a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -291,31 +291,31 @@ def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1), let Predicates = [HasStdExtZcb, HasStdExtZbb] in{ def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1), - (C_SEXT_B GPRC:$rs1, GPRC:$rs1)>; + (C_SEXT_B GPRC:$rs1)>; def : CompressPat<(SEXT_H GPRC:$rs1, GPRC:$rs1), - (C_SEXT_H GPRC:$rs1, GPRC:$rs1)>; + (C_SEXT_H GPRC:$rs1)>; } // Predicates = [HasStdExtZcb, HasStdExtZbb] let Predicates = [HasStdExtZcb, HasStdExtZbb] in{ def : CompressPat<(ZEXT_H_RV32 GPRC:$rs1, GPRC:$rs1), - (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_H GPRC:$rs1)>; def : CompressPat<(ZEXT_H_RV64 GPRC:$rs1, GPRC:$rs1), - (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_H GPRC:$rs1)>; } // Predicates = [HasStdExtZcb, HasStdExtZbb] let Predicates = [HasStdExtZcb] in{ def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, 255), - (C_ZEXT_B GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_B GPRC:$rs1)>; } // Predicates = [HasStdExtZcb] let Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] in{ def : CompressPat<(ADD_UW GPRC:$rs1, GPRC:$rs1, X0), - (C_ZEXT_W GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_W GPRC:$rs1)>; } // Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] let Predicates = [HasStdExtZcb] in{ def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1), - (C_NOT GPRC:$rs1, GPRC:$rs1)>; + (C_NOT GPRC:$rs1)>; } let Predicates = [HasStdExtZcb] in{ diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 0565fcd..726920e 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - // If the segment load is going to be performed segment at a time anyways - // and there's only one element used, use a strided load instead. This - // will be equally fast, and create less vector register pressure. 
- if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) { - unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // Note: Same VL as above, but i32 not xlen due to signature of - // vp.strided.load - VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, - {VTy, BasePtr->getType(), Stride->getType()}, - {BasePtr, Stride, Mask, VL}); - Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes); - CI->addParamAttr(0, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - Shuffles[0]->replaceAllUsesWith(CI); - return true; - }; - CallInst *VlsegN = Builder.CreateIntrinsic( FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); @@ -289,34 +266,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) return false; - unsigned Index; - // If the segment store only has one active lane (i.e. the interleave is - // just a spread shuffle), we can use a strided store instead. This will - // be equally fast, and create less vector register pressure. - if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && - isSpreadMask(Mask, Factor, Index)) { - unsigned ScalarSizeInBytes = - DL.getTypeStoreSize(ShuffleVTy->getElementType()); - Value *Data = SVI->getOperand(0); - Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0)); - Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); - Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // Note: Same VL as above, but i32 not xlen due to signature of - // vp.strided.store - VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); - - CallInst *CI = - Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, - {VTy, BasePtr->getType(), Stride->getType()}, - {Data, BasePtr, Stride, LaneMask, VL}); - Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes); - CI->addParamAttr(1, - Attribute::getWithAlignment(CI->getContext(), Alignment)); - return true; - } - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 5404123..7e58b6f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -68,6 +68,9 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { auto &Subtarget = MF->getSubtarget<RISCVSubtarget>(); if (MF->getFunction().getCallingConv() == CallingConv::GHC) return CSR_NoRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return Subtarget.hasStdExtE() ? 
CSR_RT_MostRegs_RVE_SaveList + : CSR_RT_MostRegs_SaveList; if (MF->getFunction().hasFnAttribute("interrupt")) { if (Subtarget.hasVInstructions()) { if (Subtarget.hasStdExtD()) @@ -573,6 +576,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int64_t Val = Offset.getFixed(); int64_t Lo12 = SignExtend64<12>(Val); unsigned Opc = MI.getOpcode(); + if (Opc == RISCV::ADDI && !isInt<12>(Val)) { // We chose to emit the canonical immediate sequence rather than folding // the offset into the using add under the theory that doing so doesn't @@ -585,6 +589,9 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); + } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + // MIPS Prefetch instructions require the offset to be 9 bits encoded. + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || Opc == RISCV::PseudoRV32ZdinxSD) && Lo12 >= 2044) { @@ -811,7 +818,13 @@ RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF, if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; - switch (Subtarget.getTargetABI()) { + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (CC == CallingConv::PreserveMost) { + if (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E) + return CSR_RT_MostRegs_RVE_RegMask; + return CSR_RT_MostRegs_RegMask; + } + switch (ABI) { default: llvm_unreachable("Unrecognized ABI"); case RISCVABI::ABI_ILP32E: diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index e87f452..ccb39e8 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -268,6 +268,11 @@ def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)> { let DiagnosticString = "register must be a GPR excluding zero (x0)"; } +def GPRNoX2 : GPRRegisterClass<(sub GPR, X2)> { + let DiagnosticType = "InvalidRegClassGPRNoX2"; + let DiagnosticString = "register must be a GPR excluding sp (x2)"; +} + def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)> { let DiagnosticType = "InvalidRegClassGPRNoX0X2"; let DiagnosticString = "register must be a GPR excluding zero (x0) and sp (x2)"; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 3e286a7..bf23812 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); } +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + +class Get1248Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 4, + !eq(mx, "M8") : 8, + true: 1 + ); +} + +// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides +class Get4816Latency<string mx> { + int c = !cond( + !eq(mx, "M4") : 8, + !eq(mx, "M8") : 16, + true: 4 + ); +} + +// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max +class Get458Latency<string mx> { + int c = !cond( + !eq(mx, "M4") : 5, + !eq(mx, "M8") : 8, + true: 4 + ); +} + +// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs +// Used for: widening operations +class Get4588Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 5, + !eq(mx, "M4") : 8, + !eq(mx, "M8") : 8, 
// M8 not supported for most widening, fallback + true: 4 + ); +} + +// Used for: mask-producing comparisons, carry ops with mask, FP comparisons +class Get461018Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 6, + !eq(mx, "M4") : 10, + !eq(mx, "M8") : 18, + true: 4 + ); +} + +// Used for: e64 multiply pattern, complex ops +class Get781632Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 8, + !eq(mx, "M4") : 16, + !eq(mx, "M8") : 32, + true: 7 + ); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; - - defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; - - defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + // Pattern of vadd, vsub, vrsub: 4/4/5/8 + // Pattern of vand, vor, vxor: 4/4/8/16 + // They are grouped together, so we used the worst case 4/4/8/16 + // TODO: use InstRW to override individual instructions' scheduling data + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", 
[SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, + // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. + // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites + let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + } } // Widening +// Pattern of vwmul, vwmacc, etc: e8/e16 = 4/4/5/8, e32 = 5,5,5,8 +// We use the worst-case for all. 
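// For reference, Get4588Latency<mx>.c evaluates to 4 for MF8/MF4/MF2/M1, 5 for
// M2, and 8 for M4/M8, i.e. exactly the 4,4,4,4,5,8,8 plateau described above;
// the other Get*Latency helpers encode their LMUL scaling the same way through
// !cond. For example, with mx = "M4" the widening block below gets
// Latency = 8 and ReleaseAtCycles = [4].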
foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + } } -// Vector Integer Division and Remainder +// Division and remainder operations +// Pattern of vdivu: 11/11/11/20/40/80/160 +// Pattern of vdiv: 12/12/12/22/44/88/176 +// Pattern of vremu: 12/12/12/22/44/88/176 +// Pattern of vrem: 13/13/13/24/48/96/192 +// We use for all: 12/12/12/24/48/96/192 +// TODO: Create separate WriteVIRem to more closely match the latencies foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + // Slightly reduced for fractional LMULs + defvar Multiplier = !cond( + !eq(mx, "MF8") : 12, + !eq(mx, "MF4") : 12, + !eq(mx, "MF2") : 12, + true: 24 + ); + + let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -381,12 +480,21 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; + // Slightly increased for integer LMULs + defvar Multiplier = !cond( + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 2, + true: 1 + ); + + let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", 
[SMX60_VIEU], mx, IsWorstCase>; + } } // 12. Vector Fixed-Point Arithmetic Instructions diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index b43b915..da6ac2f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -104,11 +104,6 @@ static cl::opt<bool> EnablePostMISchedLoadStoreClustering( cl::desc("Enable PostRA load and store clustering in the machine scheduler"), cl::init(true)); -static cl::opt<bool> - EnableVLOptimizer("riscv-enable-vl-optimizer", - cl::desc("Enable the RISC-V VL Optimizer pass"), - cl::init(true), cl::Hidden); - static cl::opt<bool> DisableVectorMaskMutation( "riscv-disable-vector-mask-mutation", cl::desc("Disable the vector mask scheduling mutation"), cl::init(false), @@ -617,8 +612,7 @@ void RISCVPassConfig::addPreRegAlloc() { addPass(createRISCVPreRAExpandPseudoPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createRISCVMergeBaseOffsetOptPass()); - if (EnableVLOptimizer) - addPass(createRISCVVLOptimizerPass()); + addPass(createRISCVVLOptimizerPass()); } addPass(createRISCVInsertReadWriteCSRPass()); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fd634b5..61dbd06 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1191,9 +1191,6 @@ static const CostTblEntry VectorIntrinsicCostTable[]{ {Intrinsic::roundeven, MVT::f64, 9}, {Intrinsic::rint, MVT::f32, 7}, {Intrinsic::rint, MVT::f64, 7}, - {Intrinsic::lrint, MVT::i32, 1}, - {Intrinsic::lrint, MVT::i64, 1}, - {Intrinsic::llrint, MVT::i64, 1}, {Intrinsic::nearbyint, MVT::f32, 9}, {Intrinsic::nearbyint, MVT::f64, 9}, {Intrinsic::bswap, MVT::i16, 3}, @@ -1251,11 +1248,48 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, switch (ICA.getID()) { case Intrinsic::lrint: case Intrinsic::llrint: - // We can't currently lower half or bfloat vector lrint/llrint. - if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]); - VecTy && VecTy->getElementType()->is16bitFPTy()) - return InstructionCost::getInvalid(); - [[fallthrough]]; + case Intrinsic::lround: + case Intrinsic::llround: { + auto LT = getTypeLegalizationCost(RetTy); + Type *SrcTy = ICA.getArgTypes().front(); + auto SrcLT = getTypeLegalizationCost(SrcTy); + if (ST->hasVInstructions() && LT.second.isVector()) { + SmallVector<unsigned, 2> Ops; + unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType()); + unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType()); + if (LT.second.getVectorElementType() == MVT::bf16) { + if (!ST->hasVInstructionsBF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V}; + } else if (LT.second.getVectorElementType() == MVT::f16 && + !ST->hasVInstructionsF16()) { + if (!ST->hasVInstructionsF16Minimal()) + return InstructionCost::getInvalid(); + if (DstEltSz == 32) + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V}; + else + Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V}; + + } else if (SrcEltSz > DstEltSz) { + Ops = {RISCV::VFNCVT_X_F_W}; + } else if (SrcEltSz < DstEltSz) { + Ops = {RISCV::VFWCVT_X_F_V}; + } else { + Ops = {RISCV::VFCVT_X_F_V}; + } + + // We need to use the source LMUL in the case of a narrowing op, and the + // destination LMUL otherwise. 
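// Worked example: an lrint/lround from f64 elements to i32 elements is a
// narrowing conversion (SrcEltSz 64 > DstEltSz 32), so the VFNCVT_X_F_W cost
// below is scaled by the source (f64) type's legalization factor SrcLT.first;
// the f32-to-i64 direction is widening and is instead scaled by the
// destination type's factor LT.first.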
+ if (SrcEltSz > DstEltSz) + return SrcLT.first * + getRISCVInstructionCost(Ops, SrcLT.second, CostKind); + return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind); + } + break; + } case Intrinsic::ceil: case Intrinsic::floor: case Intrinsic::trunc: diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index c1cc19b..050de3d 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -646,8 +646,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (!Src || Src->hasUnmodeledSideEffects() || Src->getParent() != MI.getParent() || !RISCVII::isFirstDefTiedToFirstUse(Src->getDesc()) || - !RISCVII::hasVLOp(Src->getDesc().TSFlags) || - !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) + !RISCVII::hasVLOp(Src->getDesc().TSFlags)) return false; // Src's dest needs to have the same EEW as MI's input. @@ -681,12 +680,14 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { *Src->getParent()->getParent())); } - // If MI was tail agnostic and the VL didn't increase, preserve it. - int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; - if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) && - RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) - Policy |= RISCVVType::TAIL_AGNOSTIC; - Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); + if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { + // If MI was tail agnostic and the VL didn't increase, preserve it. + int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; + if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) && + RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) + Policy |= RISCVVType::TAIL_AGNOSTIC; + Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); + } MRI->constrainRegClass(Src->getOperand(0).getReg(), MRI->getRegClass(MI.getOperand(0).getReg())); diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp index bbf1d87..cfe7ef4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp @@ -116,8 +116,8 @@ SPIRVTranslate(Module *M, std::string &SpirvObj, std::string &ErrMsg, PM.add(new TargetLibraryInfoWrapperPass(TLII)); std::unique_ptr<MachineModuleInfoWrapperPass> MMIWP( new MachineModuleInfoWrapperPass(Target.get())); - const_cast<TargetLoweringObjectFile *>(Target->getObjFileLowering()) - ->Initialize(MMIWP->getMMI().getContext(), *Target); + Target->getObjFileLowering()->Initialize(MMIWP->getMMI().getContext(), + *Target); SmallString<4096> OutBuffer; raw_svector_ostream OutStream(OutBuffer); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index b90e1aa..3c631ce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType()); if (HandleType->getTargetExtName() == "spirv.Image" || HandleType->getTargetExtName() == "spirv.SignedImage") { - if (II->hasOneUse()) { - auto *U = *II->users().begin(); + for (User *U : II->users()) { Ty = cast<Instruction>(U)->getAccessType(); - assert(Ty && "Unable to get type for resource pointer."); + if (Ty) + break; } } else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") { // This call is supposed to index into an array diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp 
b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 5cda6a0..7505507 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -74,17 +74,20 @@ class SPIRVLegalizePointerCast : public FunctionPass { // Returns the loaded value. Value *loadVectorFromVector(IRBuilder<> &B, FixedVectorType *SourceType, FixedVectorType *TargetType, Value *Source) { - // We expect the codegen to avoid doing implicit bitcast from a load. - assert(TargetType->getElementType() == SourceType->getElementType()); - assert(TargetType->getNumElements() < SourceType->getNumElements()); - + assert(TargetType->getNumElements() <= SourceType->getNumElements()); LoadInst *NewLoad = B.CreateLoad(SourceType, Source); buildAssignType(B, SourceType, NewLoad); + Value *AssignValue = NewLoad; + if (TargetType->getElementType() != SourceType->getElementType()) { + AssignValue = B.CreateIntrinsic(Intrinsic::spv_bitcast, + {TargetType, SourceType}, {NewLoad}); + buildAssignType(B, TargetType, AssignValue); + } SmallVector<int> Mask(/* Size= */ TargetType->getNumElements()); for (unsigned I = 0; I < TargetType->getNumElements(); ++I) Mask[I] = I; - Value *Output = B.CreateShuffleVector(NewLoad, NewLoad, Mask); + Value *Output = B.CreateShuffleVector(AssignValue, AssignValue, Mask); buildAssignType(B, TargetType, Output); return Output; } @@ -135,8 +138,9 @@ class SPIRVLegalizePointerCast : public FunctionPass { Output = loadFirstValueFromAggregate(B, SVT->getElementType(), OriginalOperand, LI); } - // Destination is a smaller vector than source. + // Destination is a smaller vector than source or different vector type. // - float3 v3 = vector4; + // - float4 v2 = int4; else if (SVT && DVT) Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand); // Destination is the scalar type stored at the start of an aggregate. diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 721f64a..1995e0f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -335,6 +335,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_SMULH, G_UMULH}).alwaysLegal(); } + getActionDefinitionsBuilder(G_IS_FPCLASS).custom(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -355,9 +357,14 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpvType, bool SPIRVLegalizerInfo::legalizeCustom( LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const { - auto Opc = MI.getOpcode(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - if (Opc == TargetOpcode::G_ICMP) { + switch (MI.getOpcode()) { + default: + // TODO: implement legalization for other opcodes. + return true; + case TargetOpcode::G_IS_FPCLASS: + return legalizeIsFPClass(Helper, MI, LocObserver); + case TargetOpcode::G_ICMP: { assert(GR->getSPIRVTypeForVReg(MI.getOperand(0).getReg())); auto &Op0 = MI.getOperand(2); auto &Op1 = MI.getOperand(3); @@ -378,6 +385,238 @@ bool SPIRVLegalizerInfo::legalizeCustom( } return true; } - // TODO: implement legalization for other opcodes. + } +} + +// Note this code was copied from LegalizerHelper::lowerISFPCLASS and adjusted +// to ensure that all instructions created during the lowering have SPIR-V types +// assigned to them. 
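// The adjustments relative to the generic lowering are concentrated in the
// assignSPIRVTy and buildSPIRVConstant helpers below: every intermediate
// constant, AND/OR, and compare result gets a SPIR-V type attached via
// assignSPIRVTypeToVReg, and the initial reinterpretation of the FP input as
// an integer is emitted directly as an OpBitcast instead of a COPY.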
+bool SPIRVLegalizerInfo::legalizeIsFPClass( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm()); + + auto &MIRBuilder = Helper.MIRBuilder; + auto &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + Type *LLVMDstTy = + IntegerType::get(MIRBuilder.getContext(), DstTy.getScalarSizeInBits()); + if (DstTy.isVector()) + LLVMDstTy = VectorType::get(LLVMDstTy, DstTy.getElementCount()); + SPIRVType *SPIRVDstTy = GR->getOrCreateSPIRVType( + LLVMDstTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + unsigned BitSize = SrcTy.getScalarSizeInBits(); + const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType()); + + LLT IntTy = LLT::scalar(BitSize); + Type *LLVMIntTy = IntegerType::get(MIRBuilder.getContext(), BitSize); + if (SrcTy.isVector()) { + IntTy = LLT::vector(SrcTy.getElementCount(), IntTy); + LLVMIntTy = VectorType::get(LLVMIntTy, SrcTy.getElementCount()); + } + SPIRVType *SPIRVIntTy = GR->getOrCreateSPIRVType( + LLVMIntTy, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + + // Clang doesn't support capture of structured bindings: + LLT DstTyCopy = DstTy; + const auto assignSPIRVTy = [&](MachineInstrBuilder &&MI) { + // Assign this MI's (assumed only) destination to one of the two types we + // expect: either the G_IS_FPCLASS's destination type, or the integer type + // bitcast from the source type. + LLT MITy = MRI.getType(MI.getReg(0)); + assert((MITy == IntTy || MITy == DstTyCopy) && + "Unexpected LLT type while lowering G_IS_FPCLASS"); + auto *SPVTy = MITy == IntTy ? SPIRVIntTy : SPIRVDstTy; + GR->assignSPIRVTypeToVReg(SPVTy, MI.getReg(0), MF); + return MI; + }; + + // Helper to build and assign a constant in one go + const auto buildSPIRVConstant = [&](LLT Ty, auto &&C) -> MachineInstrBuilder { + if (!Ty.isFixedVector()) + return assignSPIRVTy(MIRBuilder.buildConstant(Ty, C)); + auto ScalarC = MIRBuilder.buildConstant(Ty.getScalarType(), C); + assert((Ty == IntTy || Ty == DstTyCopy) && + "Unexpected LLT type while lowering constant for G_IS_FPCLASS"); + SPIRVType *VecEltTy = GR->getOrCreateSPIRVType( + (Ty == IntTy ? LLVMIntTy : LLVMDstTy)->getScalarType(), MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, + /*EmitIR*/ true); + GR->assignSPIRVTypeToVReg(VecEltTy, ScalarC.getReg(0), MF); + return assignSPIRVTy(MIRBuilder.buildSplatBuildVector(Ty, ScalarC)); + }; + + if (Mask == fcNone) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 0)); + MI.eraseFromParent(); + return true; + } + if (Mask == fcAllFlags) { + MIRBuilder.buildCopy(DstReg, buildSPIRVConstant(DstTy, 1)); + MI.eraseFromParent(); + return true; + } + + // Note that rather than creating a COPY here (between a floating-point and + // integer type of the same size) we create a SPIR-V bitcast immediately. We + // can't create a G_BITCAST because the LLTs are the same, and we can't seem + // to correctly lower COPYs to SPIR-V bitcasts at this moment. + Register ResVReg = MRI.createGenericVirtualRegister(IntTy); + MRI.setRegClass(ResVReg, GR->getRegClass(SPIRVIntTy)); + GR->assignSPIRVTypeToVReg(SPIRVIntTy, ResVReg, Helper.MIRBuilder.getMF()); + auto AsInt = MIRBuilder.buildInstr(SPIRV::OpBitcast) + .addDef(ResVReg) + .addUse(GR->getSPIRVTypeID(SPIRVIntTy)) + .addUse(SrcReg); + AsInt = assignSPIRVTy(std::move(AsInt)); + + // Various masks. 
+ APInt SignBit = APInt::getSignMask(BitSize); + APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign. + APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit. + APInt ExpMask = Inf; + APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; + APInt QNaNBitMask = + APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); + APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); + + auto SignBitC = buildSPIRVConstant(IntTy, SignBit); + auto ValueMaskC = buildSPIRVConstant(IntTy, ValueMask); + auto InfC = buildSPIRVConstant(IntTy, Inf); + auto ExpMaskC = buildSPIRVConstant(IntTy, ExpMask); + auto ZeroC = buildSPIRVConstant(IntTy, 0); + + auto Abs = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC)); + auto Sign = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs)); + + auto Res = buildSPIRVConstant(DstTy, 0); + + const auto appendToRes = [&](MachineInstrBuilder &&ToAppend) { + Res = assignSPIRVTy( + MIRBuilder.buildOr(DstTyCopy, Res, assignSPIRVTy(std::move(ToAppend)))); + }; + + // Tests that involve more than one class should be processed first. + if ((Mask & fcFinite) == fcFinite) { + // finite(V) ==> abs(V) u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs, + ExpMaskC)); + Mask &= ~fcFinite; + } else if ((Mask & fcFinite) == fcPosFinite) { + // finite(V) && V > 0 ==> V u< exp_mask + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt, + ExpMaskC)); + Mask &= ~fcPosFinite; + } else if ((Mask & fcFinite) == fcNegFinite) { + // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1 + auto Cmp = assignSPIRVTy(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, + DstTy, Abs, ExpMaskC)); + appendToRes(MIRBuilder.buildAnd(DstTy, Cmp, Sign)); + Mask &= ~fcNegFinite; + } + + if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) { + // fcZero | fcSubnormal => test all exponent bits are 0 + // TODO: Handle sign bit specific cases + // TODO: Handle inverted case + if (PartialCheck == (fcZero | fcSubnormal)) { + auto ExpBits = assignSPIRVTy(MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC)); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + ExpBits, ZeroC)); + Mask &= ~PartialCheck; + } + } + + // Check for individual classes. + if (FPClassTest PartialCheck = Mask & fcZero) { + if (PartialCheck == fcPosZero) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, ZeroC)); + else if (PartialCheck == fcZero) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC)); + else // fcNegZero + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, SignBitC)); + } + + if (FPClassTest PartialCheck = Mask & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) + auto V = (PartialCheck == fcPosSubnormal) ? 
AsInt : Abs; + auto OneC = buildSPIRVConstant(IntTy, 1); + auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); + auto SubnormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, + buildSPIRVConstant(IntTy, AllOneMantissa))); + if (PartialCheck == fcNegSubnormal) + SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); + appendToRes(std::move(SubnormalRes)); + } + + if (FPClassTest PartialCheck = Mask & fcInf) { + if (PartialCheck == fcPosInf) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, InfC)); + else if (PartialCheck == fcInf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC)); + else { // fcNegInf + APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt(); + auto NegInfC = buildSPIRVConstant(IntTy, NegInf); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + AsInt, NegInfC)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNan) { + auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask); + if (PartialCheck == fcNan) { + // isnan(V) ==> abs(V) u> int(inf) + appendToRes( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + } else if (PartialCheck == fcQNan) { + // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit) + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs, + InfWithQnanBitC)); + } else { // fcSNan + // issignaling(V) ==> abs(V) u> unsigned(Inf) && + // abs(V) u< (unsigned(Inf) | quiet_bit) + auto IsNan = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC)); + auto IsNotQnan = assignSPIRVTy(MIRBuilder.buildICmp( + CmpInst::Predicate::ICMP_ULT, DstTy, Abs, InfWithQnanBitC)); + appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan)); + } + } + + if (FPClassTest PartialCheck = Mask & fcNormal) { + // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u< + // (max_exp-1)) + APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); + auto ExpMinusOne = assignSPIRVTy( + MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB))); + APInt MaxExpMinusOne = ExpMask - ExpLSB; + auto NormalRes = assignSPIRVTy( + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne, + buildSPIRVConstant(IntTy, MaxExpMinusOne))); + if (PartialCheck == fcNegNormal) + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign); + else if (PartialCheck == fcPosNormal) { + auto PosSign = assignSPIRVTy(MIRBuilder.buildXor( + DstTy, Sign, buildSPIRVConstant(DstTy, InversionMask))); + NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign); + } + appendToRes(std::move(NormalRes)); + } + + MIRBuilder.buildCopy(DstReg, Res); + MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 6335f21..eeefa42 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -30,6 +30,10 @@ public: bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override; SPIRVLegalizerInfo(const SPIRVSubtarget &ST); + +private: + bool legalizeIsFPClass(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const; }; } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVMACHINELEGALIZER_H diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp index 3ef6030..72bb372 100644 --- 
a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp @@ -69,8 +69,8 @@ void SystemZHLASMAsmStreamer::EmitEOL() { void SystemZHLASMAsmStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, - Subsection); + MAI->printSwitchToSection(*Section, Subsection, + getContext().getTargetTriple(), OS); MCStreamer::changeSection(Section, Subsection); } diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 19c9e9c..6ae69a4 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -900,7 +900,8 @@ public: bool checkDataSection() { if (CurrentState != DataSection) { - auto *WS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly()); + auto *WS = static_cast<const MCSectionWasm *>( + getStreamer().getCurrentSectionOnly()); if (WS && WS->isText()) return error("data directive must occur in a data segment: ", Lexer.getTok()); @@ -1218,7 +1219,8 @@ public: void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override { // Code below only applies to labels in text sections. - auto *CWS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly()); + auto *CWS = static_cast<const MCSectionWasm *>( + getStreamer().getCurrentSectionOnly()); if (!CWS->isText()) return; diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index 13603f8..089be5f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -49,6 +49,8 @@ def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable FP16 instructions">; +def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">; + def FeatureMultiMemory : SubtargetFeature<"multimemory", "HasMultiMemory", "true", "Enable multiple memories">; @@ -136,13 +138,13 @@ def : ProcessorModel<"lime1", NoSchedModel, // Latest and greatest experimental version of WebAssembly. Bugs included! 
def : ProcessorModel<"bleeding-edge", NoSchedModel, - [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, - FeatureCallIndirectOverlong, FeatureExceptionHandling, - FeatureExtendedConst, FeatureFP16, FeatureMultiMemory, - FeatureMultivalue, FeatureMutableGlobals, - FeatureNontrappingFPToInt, FeatureRelaxedSIMD, - FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt, - FeatureTailCall]>; + [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, + FeatureCallIndirectOverlong, FeatureExceptionHandling, + FeatureExtendedConst, FeatureFP16, FeatureGC, + FeatureMultiMemory, FeatureMultivalue, FeatureMutableGlobals, + FeatureNontrappingFPToInt, FeatureRelaxedSIMD, + FeatureReferenceTypes, FeatureSIMD128, + FeatureSignExt, FeatureTailCall]>; //===----------------------------------------------------------------------===// // Target Declaration diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 11936a3..3f80b2a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -288,7 +288,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Expand float operations supported for scalars but not SIMD for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, - ISD::FEXP, ISD::FEXP2}) + ISD::FEXP, ISD::FEXP2, ISD::FEXP10}) for (auto T : {MVT::v4f32, MVT::v2f64}) setOperationAction(Op, T, Expand); @@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N, return SDValue(); } -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::MUL); +static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (VT != MVT::v8i32 && VT != MVT::v16i32) return SDValue(); @@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performMulCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N->getOpcode() == ISD::MUL); + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + if (auto Res = TryWideExtMulCombine(N, DCI.DAG)) + return Res; + + // We don't natively support v16i8 mul, but we do support v8i16 so split the + // inputs and extend them to v8i16. Only do this before legalization in case + // a narrow vector is widened and may be simplified later. + if (!DCI.isBeforeLegalize() || VT != MVT::v16i8) + return SDValue(); + + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue LowLHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS); + SDValue HighLHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS); + SDValue LowRHS = + DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS); + SDValue HighRHS = + DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS); + + SDValue MulLow = + DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS)); + SDValue MulHigh = DAG.getBitcast( + VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS)); + + // Take the low byte of each lane. 
+ return DAG.getVectorShuffle( + VT, DL, MulLow, MulHigh, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performLowerPartialReduction(N, DCI.DAG); } case ISD::MUL: - return performMulCombine(N, DCI.DAG); + return performMulCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index b5e723e..13d048a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -50,6 +50,9 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<(all_of FeatureFP16), "fp16">; +def HasGC : Predicate<"Subtarget->hasGC()">, + AssemblerPredicate<(all_of FeatureGC), "gc">; + def HasMultiMemory : Predicate<"Subtarget->hasMultiMemory()">, AssemblerPredicate<(all_of FeatureMultiMemory), "multimemory">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index 40b87a0..fc82e5b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -36,13 +36,10 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> { Requires<[HasReferenceTypes]>; } -defm REF_TEST_FUNCREF : - I<(outs I32: $res), - (ins TypeIndex:$type, FUNCREF: $ref), - (outs), - (ins TypeIndex:$type), - [], - "ref.test\t$type, $ref", "ref.test $type", 0xfb14>; +defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref), + (outs), (ins TypeIndex:$type), [], + "ref.test\t$type, $ref", "ref.test $type", 0xfb14>, + Requires<[HasGC]>; defm "" : REF_I<FUNCREF, funcref, "func">; defm "" : REF_I<EXTERNREF, externref, "extern">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d13862f..143298b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; } defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 28f6599..c3990d1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) { for (Instruction &I : BB) { if (I.getType()->isVoidTy()) continue; + + if (isa<AllocaInst>(&I)) { + // If the alloca has any lifetime marker that is no longer dominated + // by the alloca, remove all lifetime markers. Lifetime markers must + // always work directly on the alloca, and this is no longer possible. 
+ bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) { + auto *UserI = cast<Instruction>(U); + return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI); + }); + if (HasNonDominatedLifetimeMarker) { + for (User *U : make_early_inc_range(I.users())) { + auto *UserI = cast<Instruction>(U); + if (UserI->isLifetimeStartOrEnd()) + UserI->eraseFromParent(); + } + } + } + unsigned VarID = SSA.AddVariable(I.getName(), I.getType()); // If a value is defined by an invoke instruction, it is only available in // its normal destination and not in its unwind destination. @@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // Setjmp preparation + SmallVector<AllocaInst *> StaticAllocas; + for (Instruction &I : F.getEntryBlock()) + if (auto *AI = dyn_cast<AllocaInst>(&I)) + if (AI->isStaticAlloca()) + StaticAllocas.push_back(AI); + BasicBlock *Entry = &F.getEntryBlock(); DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram()); SplitBlock(Entry, &*Entry->getFirstInsertionPt()); + // Move static allocas back into the entry block, so they stay static. + for (AllocaInst *AI : StaticAllocas) + AI->moveBefore(Entry->getTerminator()->getIterator()); + IRB.SetInsertPoint(Entry->getTerminator()->getIterator()); // This alloca'ed pointer is used by the runtime to identify function // invocations. It's just for pointer comparisons. It will never be diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index 7912aeb..ffd135d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -63,8 +63,10 @@ void OptimizeReturned::visitCallBase(CallBase &CB) { if (isa<Constant>(Arg)) continue; // Like replaceDominatedUsesWith but using Instruction/Use dominance. 
- Arg->replaceUsesWithIf(&CB, - [&](Use &U) { return DT->dominates(&CB, U); }); + Arg->replaceUsesWithIf(&CB, [&](Use &U) { + auto *I = cast<Instruction>(U.getUser()); + return !I->isLifetimeStartOrEnd() && DT->dominates(&CB, U); + }); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 40ea48a..a3ce40f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -43,6 +43,11 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU, Bits.set(WebAssembly::FeatureBulkMemoryOpt); } + // gc implies reference-types + if (HasGC) { + HasReferenceTypes = true; + } + // reference-types implies call-indirect-overlong if (HasReferenceTypes) { HasCallIndirectOverlong = true; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index 591ce256..2f88bbb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -46,6 +46,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasExceptionHandling = false; bool HasExtendedConst = false; bool HasFP16 = false; + bool HasGC = false; bool HasMultiMemory = false; bool HasMultivalue = false; bool HasMutableGlobals = false; @@ -107,6 +108,7 @@ public: bool hasMutableGlobals() const { return HasMutableGlobals; } bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; } bool hasReferenceTypes() const { return HasReferenceTypes; } + bool hasGC() const { return HasGC; } bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; } bool hasSignExt() const { return HasSignExt; } bool hasSIMD128() const { return SIMDLevel >= SIMD128; } diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 8213e51..d7671ed 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -4803,7 +4803,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { getStreamer().initSections(false, getSTI()); Section = getStreamer().getCurrentSectionOnly(); } - if (Section->useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(*Section)) getStreamer().emitCodeAlignment(Align(2), &getSTI(), 0); else getStreamer().emitValueToAlignment(Align(2), 0, 1, 0); diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 1bf9f8b..f9bd233 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -104,6 +104,7 @@ add_llvm_target(X86CodeGen ${sources} IRPrinter Instrumentation MC + ObjCARC ProfileData Scalar SelectionDAG diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index e213923..7f9d474 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -388,36 +388,6 @@ static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) { return false; } -/// Check if the instruction to be emitted is right after any data. -static bool -isRightAfterData(MCFragment *CurrentFragment, - const std::pair<MCFragment *, size_t> &PrevInstPosition) { - MCFragment *F = CurrentFragment; - // Since data is always emitted into a DataFragment, our check strategy is - // simple here. - // - If the fragment is a DataFragment - // - If it's empty (section start or data after align), return false. 
- // - If it's not the fragment where the previous instruction is, - // returns true. - // - If it's the fragment holding the previous instruction but its - // size changed since the previous instruction was emitted into - // it, returns true. - // - Otherwise returns false. - // - If the fragment is not a DataFragment, returns false. - if (F->getKind() == MCFragment::FT_Data) - return F->getFixedSize() && (F != PrevInstPosition.first || - F->getFixedSize() != PrevInstPosition.second); - - return false; -} - -/// \returns the fragment size if it has instructions, otherwise returns 0. -static size_t getSizeForInstFragment(const MCFragment *F) { - if (!F || !F->hasInstructions()) - return 0; - return F->getSize(); -} - /// Return true if we can insert NOP or prefixes automatically before the /// the instruction to be emitted. bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { @@ -441,9 +411,11 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { // semantic. return false; - if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) - // If this instruction follows any data, there is no clear - // instruction boundary, inserting a nop/prefix would change semantic. + // If this instruction follows any data, there is no clear instruction + // boundary, inserting a nop/prefix would change semantic. + auto Offset = OS.getCurFragSize(); + if (Offset && (OS.getCurrentFragment() != PrevInstPosition.first || + Offset != PrevInstPosition.second)) return false; return true; @@ -552,7 +524,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, // Update PrevInstOpcode here, canPadInst() reads that. MCFragment *CF = OS.getCurrentFragment(); PrevInstOpcode = Inst.getOpcode(); - PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); + PrevInstPosition = std::make_pair(CF, OS.getCurFragSize()); if (!canPadBranches(OS)) return; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 0ff7f23..067bd43 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3673,6 +3673,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CLI.NumResultRegs = RVLocs.size(); CLI.Call = MIB; + // Add call site info for call graph section. 
+ if (TM.Options.EmitCallGraphSection && CB && CB->isIndirectCall()) { + MachineFunction::CallSiteInfo CSInfo(*CB); + MF->addCallSiteInfo(CLI.Call, std::move(CSInfo)); + } + return true; } @@ -4042,6 +4048,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, MO.setReg(IndexReg); } + if (MI->isCall()) + FuncInfo.MF->moveAdditionalCallInfo(MI, Result); Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); Result->cloneInstrSymbols(*FuncInfo.MF, *MI); MachineBasicBlock::iterator I(MI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 11ab8dc..bbbb1d9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58071,14 +58071,24 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { Ops[3] = Op1.getOperand(0); Ops[4] = Op1.getOperand(1); } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) { + SDValue Src = Op1; + SDValue Op10 = Op1.getOperand(0); + if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) { + // res, flags2 = sub 0, (and (xor X, -1), Y) + // cload/cstore ..., cond_ne, flag2 + // -> + // res, flags2 = sub 0, (and X, Y) + // cload/cstore ..., cond_e, flag2 + Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0), + Op1.getOperand(1)); + Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8); + } // res, flags2 = sub 0, (and X, Y) - // cload/cstore ..., cond_ne, flag2 + // cload/cstore ..., cc, flag2 // -> - // res, flags2 = and X, Y - // cload/cstore ..., cond_ne, flag2 - Ops[4] = DAG.getNode(X86ISD::AND, DL, Sub->getVTList(), Op1.getOperand(0), - Op1.getOperand(1)) - .getValue(1); + // res, flags2 = cmp (and X, Y), 0 + // cload/cstore ..., cc, flag2 + Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0)); } else { return SDValue(); } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b4639ac..5862c7e 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2060,6 +2060,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); + // Set type id for call site info. + if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall()) + CSInfo = MachineFunction::CallSiteInfo(*CB); + if (IsIndirectCall && !IsWin64 && M->getModuleFlag("import-call-optimization")) errorUnsupported(DAG, dl, diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index 620526ff..3f2a433 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -12,8 +12,52 @@ // NOTE: NO INCLUDE GUARD DESIRED! 
+#ifndef DUMMY_FUNCTION_PASS +#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) +#endif +DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) +DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this)) +DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) +DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) +#undef DUMMY_FUNCTION_PASS + #ifndef MACHINE_FUNCTION_PASS #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) #endif MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this)) #undef MACHINE_FUNCTION_PASS + +#ifndef DUMMY_MACHINE_FUNCTION_PASS +#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME) +#endif +DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization()) +DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS()) +DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment()) +DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander()) +DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix()) +DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-LEAs", FixupLEAPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-bw-inst", FixupBWInstPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-inst-tuning", X86FixupInstTuningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-setcc", X86FixupSetCCPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-vector-constants", X86FixupVectorConstantsPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-flags-copy-lowering", X86FlagsCopyLoweringPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-lower-tile-copy", X86LowerTileCopy()) +DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-load", X86LoadValueInjectionLoadHardeningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-ret", X86LoadValueInjectionRetHardeningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-optimize-LEAs", X86OptimizeLEAPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-pseudo", X86ExpandPseudo()) +DUMMY_MACHINE_FUNCTION_PASS("x86-return-thunks", X86ReturnThunks()) +DUMMY_MACHINE_FUNCTION_PASS("x86-seses", X86SpeculativeExecutionSideEffectSuppression()) +DUMMY_MACHINE_FUNCTION_PASS("x86-slh", X86SpeculativeLoadHardeningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-suppress-apx-for-relocation", X86SuppressAPXForRelocationPass()) +DUMMY_MACHINE_FUNCTION_PASS("tile-pre-config", X86PreTileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("tileconfig", X86TileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("x86-wineh-unwindv2", X86WinEHUnwindV2()) +DUMMY_MACHINE_FUNCTION_PASS("x86argumentstackrebase", X86ArgumentStackSlotPass()) +#undef DUMMY_MACHINE_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 37a7b37..90791fc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1838,14 +1838,15 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } 
}, // vpbroadcastb + { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb - { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw - { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw + { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw + { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2 + { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw @@ -1874,18 +1875,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX512ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd - {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss - {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq - {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd - {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb - - {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd - {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps - {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq - {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd + {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd + {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd + {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss + {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss + {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq + {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq + {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb + {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb + + {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd + {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps + {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq + {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca @@ -1973,21 +1981,24 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX2ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastpd - { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastps - { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq - { 
TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd - { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb - - { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd - { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps - { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq - { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd - { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb + { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb + { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd + { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps + { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq + { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd + { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb @@ -2077,23 +2088,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX1ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128 - {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128 - {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128 - - {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb + {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd + {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps + {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd + {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps + {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128 + {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128 + 
{TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128 + + {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd + {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps + {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd + {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps + {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb // + vinsertf128 - {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb + {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb // + vinsertf128 - {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb + {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb // + vinsertf128 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd @@ -2156,13 +2167,13 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry SSSE3ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb + {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb + {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb + {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb - {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb + {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb + {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb + {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por @@ -2192,16 +2203,16 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd - {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd - {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd - {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd - {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd - {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd - {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw + {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + packus {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index 17c9833..d6afb8a 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -858,16 +858,15 @@ void RISCVISAInfo::updateImplication() { StringRef ExtName = WorkList.pop_back_val(); auto Range = 
std::equal_range(std::begin(ImpliedExts), std::end(ImpliedExts), ExtName); - std::for_each(Range.first, Range.second, - [&](const ImpliedExtsEntry &Implied) { - const char *ImpliedExt = Implied.ImpliedExt; - auto [It, Inserted] = Exts.try_emplace(ImpliedExt); - if (!Inserted) - return; - auto Version = findDefaultVersion(ImpliedExt); - It->second = *Version; - WorkList.push_back(ImpliedExt); - }); + for (const ImpliedExtsEntry &Implied : llvm::make_range(Range)) { + const char *ImpliedExt = Implied.ImpliedExt; + auto [It, Inserted] = Exts.try_emplace(ImpliedExt); + if (!Inserted) + continue; + auto Version = findDefaultVersion(ImpliedExt); + It->second = *Version; + WorkList.push_back(ImpliedExt); + } } // Add Zcd if C and D are enabled. diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 4ca7444..126be71 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -446,11 +446,13 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["tanh-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; + Features["bf16-cvt-insts"] = true; Features["fp8-conversion-insts"] = true; Features["fp8e5m3-insts"] = true; Features["permlane16-swap"] = true; Features["ashr-pk-insts"] = true; Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["vmem-pref-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["atomic-buffer-global-pk-add-f16-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index ee6651c..6acb0bc 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -277,6 +277,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case PC: return "pc"; case SCEI: return "scei"; case SUSE: return "suse"; + case Meta: + return "meta"; } llvm_unreachable("Invalid VendorType!"); @@ -390,6 +392,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { case OpenHOS: return "ohos"; case PAuthTest: return "pauthtest"; + case MTIA: + return "mtia"; case LLVM: return "llvm"; case Mlibc: @@ -677,6 +681,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("suse", Triple::SUSE) .Case("oe", Triple::OpenEmbedded) .Case("intel", Triple::Intel) + .Case("meta", Triple::Meta) .Default(Triple::UnknownVendor); } @@ -780,6 +785,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("pauthtest", Triple::PAuthTest) .StartsWith("llvm", Triple::LLVM) .StartsWith("mlibc", Triple::Mlibc) + .StartsWith("mtia", Triple::MTIA) .Default(Triple::UnknownEnvironment); } diff --git a/llvm/lib/TextAPI/SymbolSet.cpp b/llvm/lib/TextAPI/SymbolSet.cpp index 2e0b416..f21a061 100644 --- a/llvm/lib/TextAPI/SymbolSet.cpp +++ b/llvm/lib/TextAPI/SymbolSet.cpp @@ -11,6 +11,11 @@ using namespace llvm; using namespace llvm::MachO; +SymbolSet::~SymbolSet() { + for (auto &[Key, Sym] : Symbols) + Sym->~Symbol(); +} + Symbol *SymbolSet::addGlobalImpl(EncodeKind Kind, StringRef Name, SymbolFlags Flags) { Name = copyString(Name); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 64b33e4..ab906f9 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1568,7 +1568,7 @@ private: if (DebugLoc SuspendLoc = S->getDebugLoc()) { std::string LabelName = ("__coro_resume_" + 
Twine(SuspendIndex)).str(); - DILocation &DILoc = *SuspendLoc.get(); + DILocation &DILoc = *SuspendLoc; DILabel *ResumeLabel = DBuilder.createLabel(DIS, LabelName, DILoc.getFile(), SuspendLoc.getLine(), SuspendLoc.getCol(), diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index b3910c4..d895cd7 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -37,6 +37,16 @@ // memory that ends up in one of the runtime equivalents, since this can // happen if e.g. a library that was compiled without interposition returns // an allocation that can be validly passed to `free`. +// +// 3. MathFixup (required): Some accelerators might have an incomplete +// implementation for the intrinsics used to implement some of the math +// functions in <cmath> / their corresponding libcall lowerings. Since this +// can vary quite significantly between accelerators, we replace calls to a +// set of intrinsics / lib functions known to be problematic with calls to a +// HIPSTDPAR specific forwarding layer, which gives an uniform interface for +// accelerators to implement in their own runtime components. This pass +// should run before AcceleratorCodeSelection so as to prevent the spurious +// removal of the HIPSTDPAR specific forwarding functions. //===----------------------------------------------------------------------===// #include "llvm/Transforms/HipStdPar/HipStdPar.h" @@ -49,6 +59,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -519,3 +530,110 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { return PreservedAnalyses::none(); } + +static constexpr std::pair<StringLiteral, StringLiteral> MathLibToHipStdPar[]{ + {"acosh", "__hipstdpar_acosh_f64"}, + {"acoshf", "__hipstdpar_acosh_f32"}, + {"asinh", "__hipstdpar_asinh_f64"}, + {"asinhf", "__hipstdpar_asinh_f32"}, + {"atanh", "__hipstdpar_atanh_f64"}, + {"atanhf", "__hipstdpar_atanh_f32"}, + {"cbrt", "__hipstdpar_cbrt_f64"}, + {"cbrtf", "__hipstdpar_cbrt_f32"}, + {"erf", "__hipstdpar_erf_f64"}, + {"erff", "__hipstdpar_erf_f32"}, + {"erfc", "__hipstdpar_erfc_f64"}, + {"erfcf", "__hipstdpar_erfc_f32"}, + {"fdim", "__hipstdpar_fdim_f64"}, + {"fdimf", "__hipstdpar_fdim_f32"}, + {"expm1", "__hipstdpar_expm1_f64"}, + {"expm1f", "__hipstdpar_expm1_f32"}, + {"hypot", "__hipstdpar_hypot_f64"}, + {"hypotf", "__hipstdpar_hypot_f32"}, + {"ilogb", "__hipstdpar_ilogb_f64"}, + {"ilogbf", "__hipstdpar_ilogb_f32"}, + {"lgamma", "__hipstdpar_lgamma_f64"}, + {"lgammaf", "__hipstdpar_lgamma_f32"}, + {"log1p", "__hipstdpar_log1p_f64"}, + {"log1pf", "__hipstdpar_log1p_f32"}, + {"logb", "__hipstdpar_logb_f64"}, + {"logbf", "__hipstdpar_logb_f32"}, + {"nextafter", "__hipstdpar_nextafter_f64"}, + {"nextafterf", "__hipstdpar_nextafter_f32"}, + {"nexttoward", "__hipstdpar_nexttoward_f64"}, + {"nexttowardf", "__hipstdpar_nexttoward_f32"}, + {"remainder", "__hipstdpar_remainder_f64"}, + {"remainderf", "__hipstdpar_remainder_f32"}, + {"remquo", "__hipstdpar_remquo_f64"}, + {"remquof", "__hipstdpar_remquo_f32"}, + {"scalbln", "__hipstdpar_scalbln_f64"}, + {"scalblnf", "__hipstdpar_scalbln_f32"}, + {"scalbn", "__hipstdpar_scalbn_f64"}, + {"scalbnf", "__hipstdpar_scalbn_f32"}, + {"tgamma", "__hipstdpar_tgamma_f64"}, + {"tgammaf", "__hipstdpar_tgamma_f32"}}; + +PreservedAnalyses 
HipStdParMathFixupPass::run(Module &M, + ModuleAnalysisManager &) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector<std::pair<Function *, std::string>> ToReplace; + for (auto &&F : M) { + if (!F.hasName()) + continue; + + StringRef N = F.getName(); + Intrinsic::ID ID = F.getIntrinsicID(); + + switch (ID) { + case Intrinsic::not_intrinsic: { + auto It = + find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; }); + if (It == std::cend(MathLibToHipStdPar)) + continue; + ToReplace.emplace_back(&F, It->second); + break; + } + case Intrinsic::acos: + case Intrinsic::asin: + case Intrinsic::atan: + case Intrinsic::atan2: + case Intrinsic::cosh: + case Intrinsic::modf: + case Intrinsic::sinh: + case Intrinsic::tan: + case Intrinsic::tanh: + break; + default: { + if (F.getReturnType()->isDoubleTy()) { + switch (ID) { + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::pow: + case Intrinsic::sin: + break; + default: + continue; + } + break; + } + continue; + } + } + + ToReplace.emplace_back(&F, N); + llvm::replace(ToReplace.back().second, '.', '_'); + StringRef Prefix = "llvm"; + ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar"); + } + for (auto &&[F, NewF] : ToReplace) + F->replaceAllUsesWith( + M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee()); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index f43202e..8262c8c 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes, struct SCCNodesResult { SCCNodeSet SCCNodes; - bool HasUnknownCall; }; } // end anonymous namespace @@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes, static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) { SCCNodesResult Res; - Res.HasUnknownCall = false; for (Function *F : Functions) { if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) || F->isPresplitCoroutine()) { - // Treat any function we're trying not to optimize as if it were an - // indirect call and omit it from the node set used below. - Res.HasUnknownCall = true; + // Omit any functions we're trying not to optimize from the set. continue; } - // Track whether any functions in this SCC have an unknown call edge. - // Note: if this is ever a performance hit, we can common it with - // subsequent routines which also do scans over the instructions of the - // function. - if (!Res.HasUnknownCall) { - for (Instruction &I : instructions(*F)) { - if (auto *CB = dyn_cast<CallBase>(&I)) { - if (!CB->getCalledFunction()) { - Res.HasUnknownCall = true; - break; - } - } - } - } + Res.SCCNodes.insert(F); } return Res; @@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter, addColdAttrs(Nodes.SCCNodes, Changed); addWillReturn(Nodes.SCCNodes, Changed); addNoUndefAttrs(Nodes.SCCNodes, Changed); - - // If we have no external nodes participating in the SCC, we can deduce some - // more precise attributes as well. 
- if (!Nodes.HasUnknownCall) { - addNoAliasAttrs(Nodes.SCCNodes, Changed); - addNonNullAttrs(Nodes.SCCNodes, Changed); - inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); - addNoRecurseAttrs(Nodes.SCCNodes, Changed); - } + addNoAliasAttrs(Nodes.SCCNodes, Changed); + addNonNullAttrs(Nodes.SCCNodes, Changed); + inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed); + addNoRecurseAttrs(Nodes.SCCNodes, Changed); // Finally, infer the maximal set of attributes from the ones we've inferred // above. This is handling the cases where one attribute on a signature diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 486205c..57844a1 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -502,8 +502,7 @@ class LowerTypeTestsModule { uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); void importTypeTest(CallInst *CI); - void importFunction(Function *F, bool isJumpTableCanonical, - std::vector<GlobalAlias *> &AliasesToErase); + void importFunction(Function *F, bool isJumpTableCanonical); BitSetInfo buildBitSet(Metadata *TypeId, @@ -1103,9 +1102,8 @@ void LowerTypeTestsModule::maybeReplaceComdat(Function *F, // ThinLTO backend: the function F has a jump table entry; update this module // accordingly. isJumpTableCanonical describes the type of the jump table entry. -void LowerTypeTestsModule::importFunction( - Function *F, bool isJumpTableCanonical, - std::vector<GlobalAlias *> &AliasesToErase) { +void LowerTypeTestsModule::importFunction(Function *F, + bool isJumpTableCanonical) { assert(F->getType()->getAddressSpace() == 0); GlobalValue::VisibilityTypes Visibility = F->getVisibility(); @@ -1135,23 +1133,23 @@ void LowerTypeTestsModule::importFunction( } else { F->setName(Name + ".cfi"); maybeReplaceComdat(F, Name); - F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), Name, &M); FDecl->setVisibility(Visibility); Visibility = GlobalValue::HiddenVisibility; - // Delete aliases pointing to this function, they'll be re-created in the - // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed - // will want to reset the aliasees first. + // Update aliases pointing to this function to also include the ".cfi" suffix, + // We expect the jump table entry to either point to the real function or an + // alias. Redirect all other users to the jump table entry. 
for (auto &U : F->uses()) { if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) { + std::string AliasName = A->getName().str() + ".cfi"; Function *AliasDecl = Function::Create( F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), "", &M); AliasDecl->takeName(A); A->replaceAllUsesWith(AliasDecl); - AliasesToErase.push_back(A); + A->setName(AliasName); } } } @@ -2077,16 +2075,13 @@ bool LowerTypeTestsModule::lower() { Decls.push_back(&F); } - std::vector<GlobalAlias *> AliasesToErase; { ScopedSaveAliaseesAndUsed S(M); for (auto *F : Defs) - importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase); + importFunction(F, /*isJumpTableCanonical*/ true); for (auto *F : Decls) - importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase); + importFunction(F, /*isJumpTableCanonical*/ false); } - for (GlobalAlias *GA : AliasesToErase) - GA->eraseFromParent(); return true; } @@ -2137,6 +2132,18 @@ bool LowerTypeTestsModule::lower() { if (auto Alias = dyn_cast<AliasSummary>(RefGVS.get())) AddressTaken.insert(Alias->getAliaseeGUID()); } + auto IsAddressTaken = [&](GlobalValue::GUID GUID) { + if (AddressTaken.count(GUID)) + return true; + auto VI = ExportSummary->getValueInfo(GUID); + if (!VI) + return false; + for (auto &I : VI.getSummaryList()) + if (auto Alias = dyn_cast<AliasSummary>(I.get())) + if (AddressTaken.count(Alias->getAliaseeGUID())) + return true; + return false; + }; for (auto *FuncMD : CfiFunctionsMD->operands()) { assert(FuncMD->getNumOperands() >= 2); StringRef FunctionName = @@ -2153,7 +2160,7 @@ bool LowerTypeTestsModule::lower() { // have no live references (and are not exported with cross-DSO CFI.) if (!ExportSummary->isGUIDLive(GUID)) continue; - if (!AddressTaken.count(GUID)) { + if (!IsAddressTaken(GUID)) { if (!CrossDsoCfi || Linkage != CFL_Definition) continue; @@ -2227,6 +2234,43 @@ bool LowerTypeTestsModule::lower() { } } + struct AliasToCreate { + Function *Alias; + std::string TargetName; + }; + std::vector<AliasToCreate> AliasesToCreate; + + // Parse alias data to replace stand-in function declarations for aliases + // with an alias to the intended target. + if (ExportSummary) { + if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) { + for (auto *AliasMD : AliasesMD->operands()) { + SmallVector<Function *> Aliases; + for (Metadata *MD : AliasMD->operands()) { + auto *MDS = dyn_cast<MDString>(MD); + if (!MDS) + continue; + StringRef AliasName = MDS->getString(); + if (!ExportedFunctions.count(AliasName)) + continue; + auto *AliasF = M.getFunction(AliasName); + if (AliasF) + Aliases.push_back(AliasF); + } + + if (Aliases.empty()) + continue; + + for (unsigned I = 1; I != Aliases.size(); ++I) { + auto *AliasF = Aliases[I]; + ExportedFunctions.erase(AliasF->getName()); + AliasesToCreate.push_back( + {AliasF, std::string(Aliases[0]->getName())}); + } + } + } + } + DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers; for (GlobalObject &GO : M.global_objects()) { if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker()) @@ -2414,47 +2458,16 @@ bool LowerTypeTestsModule::lower() { allocateByteArrays(); - // Parse alias data to replace stand-in function declarations for aliases - // with an alias to the intended target. 
- if (ExportSummary) { - if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) { - for (auto *AliasMD : AliasesMD->operands()) { - assert(AliasMD->getNumOperands() >= 4); - StringRef AliasName = - cast<MDString>(AliasMD->getOperand(0))->getString(); - StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString(); - - if (auto It = ExportedFunctions.find(Aliasee); - It == ExportedFunctions.end() || - It->second.Linkage != CFL_Definition || !M.getNamedAlias(Aliasee)) - continue; - - GlobalValue::VisibilityTypes Visibility = - static_cast<GlobalValue::VisibilityTypes>( - cast<ConstantAsMetadata>(AliasMD->getOperand(2)) - ->getValue() - ->getUniqueInteger() - .getZExtValue()); - bool Weak = - static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3)) - ->getValue() - ->getUniqueInteger() - .getZExtValue()); - - auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee)); - Alias->setVisibility(Visibility); - if (Weak) - Alias->setLinkage(GlobalValue::WeakAnyLinkage); - - if (auto *F = M.getFunction(AliasName)) { - Alias->takeName(F); - F->replaceAllUsesWith(Alias); - F->eraseFromParent(); - } else { - Alias->setName(AliasName); - } - } - } + for (auto A : AliasesToCreate) { + auto *Target = M.getNamedValue(A.TargetName); + if (!isa<GlobalAlias>(Target)) + continue; + auto *AliasGA = GlobalAlias::create("", Target); + AliasGA->setVisibility(A.Alias->getVisibility()); + AliasGA->setLinkage(A.Alias->getLinkage()); + AliasGA->takeName(A.Alias); + A.Alias->replaceAllUsesWith(AliasGA); + A.Alias->eraseFromParent(); } // Emit .symver directives for exported functions, if they exist. diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b803c97..c009c1e 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); +STATISTIC(MismatchedCloneAssignments, + "Number of callsites assigned to call multiple non-matching clones"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -730,7 +732,7 @@ private: /// of the functions tracked calls to their new versions in the CallMap. /// Assigns new clones to clone number CloneNo. 
FuncInfo cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); @@ -897,7 +899,7 @@ private: CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map<CallInfo, CallInfo> &CallMap, + DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo); std::string getLabel(const Function *Func, const Instruction *Call, @@ -989,7 +991,7 @@ private: CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, - std::map<CallInfo, CallInfo> &CallMap, + DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo); std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, @@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) { return F.getName().contains(MemProfCloneSuffix); } +// Return the clone number of the given function by extracting it from the +// memprof suffix. Assumes the caller has already confirmed it is a memprof +// clone. +static unsigned getMemProfCloneNum(const Function &F) { + assert(isMemProfClone(F)); + auto Pos = F.getName().find_last_of('.'); + assert(Pos > 0); + unsigned CloneNo; + bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + assert(!Err); + (void)Err; + return CloneNo; +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -2073,14 +2089,14 @@ std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, unsigned CloneNo) const { auto VI = FSToVIMap.find(Func); assert(VI != FSToVIMap.end()); + std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo); if (isa<AllocInfo *>(Call)) - return (VI->second.name() + " -> alloc").str(); + return CallerName + " -> alloc"; else { auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call); - return (VI->second.name() + " -> " + - getMemProfFuncName(Callsite->Callee.name(), - Callsite->Clones[CloneNo])) - .str(); + return CallerName + " -> " + + getMemProfFuncName(Callsite->Callee.name(), + Callsite->Clones[CloneNo]); } } @@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - if (CalleeFunc.cloneNo() > 0) + auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + if (isMemProfClone(*CurF)) { + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. 
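    // A hypothetical example of the check below, assuming the usual
    // ".memprof." clone suffix: if this callsite was already rewritten to
    // call "foo.memprof.2", getMemProfCloneNum() takes the text after the
    // last '.' and parses 2. Should the assignment now request clone 3
    // instead, the two disagree and MismatchedCloneAssignments is
    // incremented; the call itself is only retargeted below when
    // NewCalleeCloneNo > 0.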
+ auto CurCalleeCloneNo = getMemProfCloneNum(*CurF); + if (CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + } + if (NewCalleeCloneNo > 0) cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); OREGetter(CallerCall.call()->getFunction()) .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) @@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, assert(CI && "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); - CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + CurCalleeCloneNo = NewCalleeCloneNo; } // Update the debug information attached to NewFunc to use the clone Name. Note @@ -4019,7 +4062,7 @@ static void updateSubprogramLinkageName(Function *NewFunc, StringRef Name) { CallsiteContextGraph<ModuleCallsiteContextGraph, Function, Instruction *>::FuncInfo ModuleCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { // Use existing LLVM facilities for cloning and obtaining Call in clone ValueToValueMapTy VMap; @@ -4042,7 +4085,7 @@ ModuleCallsiteContextGraph::cloneFunctionForCallsite( CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, IndexCall>::FuncInfo IndexCallsiteContextGraph::cloneFunctionForCallsite( - FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, + FuncInfo &Func, CallInfo &Call, DenseMap<CallInfo, CallInfo> &CallMap, std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { // Check how many clones we have of Call (and therefore function). // The next clone number is the current size of versions array. @@ -4457,14 +4500,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; }; + // Information for a single clone of this Func. + struct FuncCloneInfo { + // The function clone. + FuncInfo FuncClone; + // Remappings of each call of interest (from original uncloned call to the + // corresponding cloned call in this function clone). + DenseMap<CallInfo, CallInfo> CallMap; + }; + // Walk all functions for which we saw calls with memprof metadata, and handle // cloning for each of its calls. for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { FuncInfo OrigFunc(Func); - // Map from each clone of OrigFunc to a map of remappings of each call of - // interest (from original uncloned call to the corresponding cloned call in - // that function clone). 
- std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; + // Map from each clone number of OrigFunc to information about that function + // clone (the function clone FuncInfo and call remappings). The index into + // the vector is the clone number, as function clones are created and + // numbered sequentially. + std::vector<FuncCloneInfo> FuncCloneInfos; for (auto &Call : CallsWithMetadata) { ContextNode *Node = getNodeForInst(Call); // Skip call if we do not have a node for it (all uses of its stack ids @@ -4488,8 +4541,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // Record the clone of callsite node assigned to this function clone. FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; - assert(FuncClonesToCallMap.count(FuncClone)); - std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; + assert(FuncCloneInfos.size() > FuncClone.cloneNo()); + DenseMap<CallInfo, CallInfo> &CallMap = + FuncCloneInfos[FuncClone.cloneNo()].CallMap; CallInfo CallClone(Call); if (auto It = CallMap.find(Call); It != CallMap.end()) CallClone = It->second; @@ -4528,10 +4582,10 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // than existing function clones, which would have been assigned to an // earlier clone in the list (we assign callsite clones to function // clones greedily). - if (FuncClonesToCallMap.size() < NodeCloneCount) { + if (FuncCloneInfos.size() < NodeCloneCount) { // If this is the first callsite copy, assign to original function. if (NodeCloneCount == 1) { - // Since FuncClonesToCallMap is empty in this case, no clones have + // Since FuncCloneInfos is empty in this case, no clones have // been created for this function yet, and no callers should have // been assigned a function clone for this callee node yet. assert(llvm::none_of( @@ -4540,7 +4594,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { })); // Initialize with empty call map, assign Clone to original function // and its callers, and skip to the next clone. - FuncClonesToCallMap[OrigFunc] = {}; + FuncCloneInfos.push_back( + {OrigFunc, DenseMap<CallInfo, CallInfo>()}); AssignCallsiteCloneToFuncClone( OrigFunc, Call, Clone, AllocationCallToContextNodeMap.count(Call)); @@ -4572,14 +4627,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { } // Clone function and save it along with the CallInfo map created - // during cloning in the FuncClonesToCallMap. - std::map<CallInfo, CallInfo> NewCallMap; - unsigned CloneNo = FuncClonesToCallMap.size(); + // during cloning in the FuncCloneInfos. + DenseMap<CallInfo, CallInfo> NewCallMap; + unsigned CloneNo = FuncCloneInfos.size(); assert(CloneNo > 0 && "Clone 0 is the original function, which " "should already exist in the map"); FuncInfo NewFuncClone = cloneFunctionForCallsite( OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); - FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); + FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)}); FunctionClonesAnalysis++; Changed = true; @@ -4680,8 +4735,8 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // CallMap is set up as indexed by original Call at clone 0. 
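              // Hypothetical illustration of the lookup below: if the
              // original (clone 0) callsite is `call void @bar()` inside
              // @foo, the CallMap entry created while cloning @foo is
              //   (that call, clone 0) -> its copy inside NewFuncClone
              // so NewCall resolves to the instruction in the new function
              // clone that corresponds to this caller edge.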
CallInfo OrigCall(Callee->getOrigNode()->Call); OrigCall.setCloneNo(0); - std::map<CallInfo, CallInfo> &CallMap = - FuncClonesToCallMap[NewFuncClone]; + DenseMap<CallInfo, CallInfo> &CallMap = + FuncCloneInfos[NewFuncClone.cloneNo()].CallMap; assert(CallMap.count(OrigCall)); CallInfo NewCall(CallMap[OrigCall]); assert(NewCall); @@ -4703,6 +4758,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // where the callers were assigned to different clones of a function. } + auto FindFirstAvailFuncClone = [&]() { + // Find first function in FuncCloneInfos without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncCloneInfos size was smaller than the clone number. + for (auto &CF : FuncCloneInfos) { + if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone)) + return CF.FuncClone; + } + llvm_unreachable( + "Expected an available func clone for this callsite clone"); + }; + // See if we can use existing function clone. Walk through // all caller edges to see if any have already been assigned to // a clone of this callsite's function. If we can use it, do so. If not, @@ -4819,16 +4887,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // clone of OrigFunc for another caller during this iteration over // its caller edges. if (!FuncCloneAssignedToCurCallsiteClone) { - // Find first function in FuncClonesToCallMap without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncClonesToCallMap size was smaller than the clone number. - for (auto &CF : FuncClonesToCallMap) { - if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { - FuncCloneAssignedToCurCallsiteClone = CF.first; - break; - } - } + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); assert(FuncCloneAssignedToCurCallsiteClone); // Assign Clone to FuncCloneAssignedToCurCallsiteClone AssignCallsiteCloneToFuncClone( @@ -4842,6 +4901,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncCloneAssignedToCurCallsiteClone); } } + // If we didn't assign a function clone to this callsite clone yet, e.g. + // none of its callers has a non-null call, do the assignment here. + // We want to ensure that every callsite clone is assigned to some + // function clone, so that the call updates below work as expected. + // In particular if this is the original callsite, we want to ensure it + // is assigned to the original function, otherwise the original function + // will appear available for assignment to other callsite clones, + // leading to unintended effects. For one, the unknown and not updated + // callers will call into cloned paths leading to the wrong hints, + // because they still call the original function (clone 0). Also, + // because all callsites start out as being clone 0 by default, we can't + // easily distinguish between callsites explicitly assigned to clone 0 + // vs those never assigned, which can lead to multiple updates of the + // calls when invoking updateCall below, with mismatched clone values. + // TODO: Add a flag to the callsite nodes or some other mechanism to + // better distinguish and identify callsite clones that are not getting + // assigned to function clones as expected. 
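      // A concrete (hypothetical) scenario: suppose only callsite clone 1 in
      // @foo was explicitly assigned, and clone 0 never saw a caller with a
      // non-null call. Without the fallback below, the original @foo looks
      // unassigned and may be handed to yet another callsite clone, while
      // callers that were never updated still enter the original @foo and
      // get routed to whichever callee clone that callsite was rewritten
      // for, producing the wrong allocation hints.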
+ if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); + assert(FuncCloneAssignedToCurCallsiteClone && + "No available func clone for this callsite clone"); + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call)); + } } if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(Node); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index e276376..4387c38 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -384,6 +384,10 @@ void splitAndWriteThinLTOBitcode( for (auto &F : M) if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F)) CfiFunctions.insert(&F); + for (auto &A : M.aliases()) + if (auto *F = dyn_cast<Function>(A.getAliasee())) + if (HasTypeMetadata(F)) + CfiFunctions.insert(&A); // Remove all globals with type metadata, globals with comdats that live in // MergedM, and aliases pointing to such globals from the thin LTO module. @@ -403,12 +407,12 @@ void splitAndWriteThinLTOBitcode( auto &Ctx = MergedM->getContext(); SmallVector<MDNode *, 8> CfiFunctionMDs; for (auto *V : CfiFunctions) { - Function &F = *cast<Function>(V); + Function &F = *cast<Function>(V->getAliaseeObject()); SmallVector<MDNode *, 2> Types; F.getMetadata(LLVMContext::MD_type, Types); SmallVector<Metadata *, 4> Elts; - Elts.push_back(MDString::get(Ctx, F.getName())); + Elts.push_back(MDString::get(Ctx, V->getName())); CfiFunctionLinkage Linkage; if (lowertypetests::isJumpTableCanonical(&F)) Linkage = CFL_Definition; @@ -428,29 +432,24 @@ void splitAndWriteThinLTOBitcode( NMD->addOperand(MD); } - SmallVector<MDNode *, 8> FunctionAliases; + MapVector<Function *, std::vector<GlobalAlias *>> FunctionAliases; for (auto &A : M.aliases()) { if (!isa<Function>(A.getAliasee())) continue; auto *F = cast<Function>(A.getAliasee()); - - Metadata *Elts[] = { - MDString::get(Ctx, A.getName()), - MDString::get(Ctx, F->getName()), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())), - }; - - FunctionAliases.push_back(MDTuple::get(Ctx, Elts)); + FunctionAliases[F].push_back(&A); } if (!FunctionAliases.empty()) { NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases"); - for (auto *MD : FunctionAliases) - NMD->addOperand(MD); + for (auto &Alias : FunctionAliases) { + SmallVector<Metadata *> Elts; + Elts.push_back(MDString::get(Ctx, Alias.first->getName())); + for (auto *A : Alias.second) + Elts.push_back(MDString::get(Ctx, A->getName())); + NMD->addOperand(MDTuple::get(Ctx, Elts)); + } } SmallVector<MDNode *, 8> Symvers; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b231c04..d7971e8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -11,10 +11,13 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/FloatingPointPredicateUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" 
#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" @@ -3589,6 +3592,154 @@ static Value *foldOrOfInversions(BinaryOperator &I, return nullptr; } +/// Match \p V as "shufflevector -> bitcast" or "extractelement -> zext -> shl" +/// patterns, which extract vector elements and pack them in the same relative +/// positions. +/// +/// \p Vec is the underlying vector being extracted from. +/// \p Mask is a bitmask identifying which packed elements are obtained from the +/// vector. +/// \p VecOffset is the vector element corresponding to index 0 of the +/// mask. +static bool matchSubIntegerPackFromVector(Value *V, Value *&Vec, + int64_t &VecOffset, + SmallBitVector &Mask, + const DataLayout &DL) { + static const auto m_ConstShlOrSelf = [](const auto &Base, uint64_t &ShlAmt) { + ShlAmt = 0; + return m_CombineOr(m_Shl(Base, m_ConstantInt(ShlAmt)), Base); + }; + + // First try to match extractelement -> zext -> shl + uint64_t VecIdx, ShlAmt; + if (match(V, m_ConstShlOrSelf(m_ZExtOrSelf(m_ExtractElt( + m_Value(Vec), m_ConstantInt(VecIdx))), + ShlAmt))) { + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + auto *EltTy = dyn_cast<IntegerType>(VecTy->getElementType()); + if (!EltTy) + return false; + + const unsigned EltBitWidth = EltTy->getBitWidth(); + const unsigned TargetBitWidth = V->getType()->getIntegerBitWidth(); + if (TargetBitWidth % EltBitWidth != 0 || ShlAmt % EltBitWidth != 0) + return false; + const unsigned TargetEltWidth = TargetBitWidth / EltBitWidth; + const unsigned ShlEltAmt = ShlAmt / EltBitWidth; + + const unsigned MaskIdx = + DL.isLittleEndian() ? ShlEltAmt : TargetEltWidth - ShlEltAmt - 1; + + VecOffset = static_cast<int64_t>(VecIdx) - static_cast<int64_t>(MaskIdx); + Mask.resize(TargetEltWidth); + Mask.set(MaskIdx); + return true; + } + + // Now try to match a bitcasted subvector. + Instruction *SrcVecI; + if (!match(V, m_BitCast(m_Instruction(SrcVecI)))) + return false; + + auto *SrcTy = dyn_cast<FixedVectorType>(SrcVecI->getType()); + if (!SrcTy) + return false; + + Mask.resize(SrcTy->getNumElements()); + + // First check for a subvector obtained from a shufflevector. 
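    // Worked example (illustrative IR): for
    //   %s = shufflevector <4 x i8> %vec, <4 x i8> zeroinitializer,
    //                      <4 x i32> <i32 2, i32 3, i32 4, i32 4>
    //   %v = bitcast <4 x i8> %s to i32
    // the loop below computes VecOffset = 2 and Mask = {0, 1}: packed bytes
    // 0 and 1 of %v come from %vec[2] and %vec[3], and the remaining lanes
    // are known zero because they are taken from the zero constant operand.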
+ if (isa<ShuffleVectorInst>(SrcVecI)) { + Constant *ConstVec; + ArrayRef<int> ShuffleMask; + if (!match(SrcVecI, m_Shuffle(m_Value(Vec), m_Constant(ConstVec), + m_Mask(ShuffleMask)))) + return false; + + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + + const unsigned NumVecElts = VecTy->getNumElements(); + bool FoundVecOffset = false; + for (unsigned Idx = 0; Idx < ShuffleMask.size(); ++Idx) { + if (ShuffleMask[Idx] == PoisonMaskElem) + return false; + const unsigned ShuffleIdx = ShuffleMask[Idx]; + if (ShuffleIdx >= NumVecElts) { + const unsigned ConstIdx = ShuffleIdx - NumVecElts; + auto *ConstElt = + dyn_cast<ConstantInt>(ConstVec->getAggregateElement(ConstIdx)); + if (!ConstElt || !ConstElt->isNullValue()) + return false; + continue; + } + + if (FoundVecOffset) { + if (VecOffset + Idx != ShuffleIdx) + return false; + } else { + if (ShuffleIdx < Idx) + return false; + VecOffset = ShuffleIdx - Idx; + FoundVecOffset = true; + } + Mask.set(Idx); + } + return FoundVecOffset; + } + + // Check for a subvector obtained as an (insertelement V, 0, idx) + uint64_t InsertIdx; + if (!match(SrcVecI, + m_InsertElt(m_Value(Vec), m_Zero(), m_ConstantInt(InsertIdx)))) + return false; + + auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType()); + if (!VecTy) + return false; + VecOffset = 0; + bool AlreadyInsertedMaskedElt = Mask.test(InsertIdx); + Mask.set(); + if (!AlreadyInsertedMaskedElt) + Mask.reset(InsertIdx); + return true; +} + +/// Try to fold the join of two scalar integers whose contents are packed +/// elements of the same vector. +static Instruction *foldIntegerPackFromVector(Instruction &I, + InstCombiner::BuilderTy &Builder, + const DataLayout &DL) { + assert(I.getOpcode() == Instruction::Or); + Value *LhsVec, *RhsVec; + int64_t LhsVecOffset, RhsVecOffset; + SmallBitVector Mask; + if (!matchSubIntegerPackFromVector(I.getOperand(0), LhsVec, LhsVecOffset, + Mask, DL)) + return nullptr; + if (!matchSubIntegerPackFromVector(I.getOperand(1), RhsVec, RhsVecOffset, + Mask, DL)) + return nullptr; + if (LhsVec != RhsVec || LhsVecOffset != RhsVecOffset) + return nullptr; + + // Convert into shufflevector -> bitcast; + const unsigned ZeroVecIdx = + cast<FixedVectorType>(LhsVec->getType())->getNumElements(); + SmallVector<int> ShuffleMask(Mask.size(), ZeroVecIdx); + for (unsigned Idx : Mask.set_bits()) { + assert(LhsVecOffset + Idx >= 0); + ShuffleMask[Idx] = LhsVecOffset + Idx; + } + + Value *MaskedVec = Builder.CreateShuffleVector( + LhsVec, Constant::getNullValue(LhsVec->getType()), ShuffleMask, + I.getName() + ".v"); + return CastInst::Create(Instruction::BitCast, MaskedVec, I.getType()); +} + // A decomposition of ((X & Mask) * Factor). The NUW / NSW bools // track these properities for preservation. Note that we can decompose // equivalent select form of this expression (e.g. (!(X & Mask) ? 
0 : Mask * @@ -3766,6 +3917,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *X = foldComplexAndOrPatterns(I, Builder)) return X; + if (Instruction *X = foldIntegerPackFromVector(I, Builder, DL)) + return X; + // (A & B) | (C & D) -> A ^ D where A == ~C && B == ~D // (A & B) | (C & D) -> A ^ C where A == ~D && B == ~C if (Value *V = foldOrOfInversions(I, Builder)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index d88bc2c..47e017e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1830,10 +1830,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { bool IntMinIsPoison = cast<Constant>(II->getArgOperand(1))->isOneValue(); // abs(-x) -> abs(x) - // TODO: Copy nsw if it was present on the neg? Value *X; - if (match(IIOperand, m_Neg(m_Value(X)))) + if (match(IIOperand, m_Neg(m_Value(X)))) { + if (cast<Instruction>(IIOperand)->hasNoSignedWrap() || IntMinIsPoison) + replaceOperand(*II, 1, Builder.getTrue()); return replaceOperand(*II, 0, X); + } if (match(IIOperand, m_c_Select(m_Neg(m_Value(X)), m_Deferred(X)))) return replaceOperand(*II, 0, X); @@ -3889,16 +3891,20 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } - // Try to fold intrinsic into select operands. This is legal if: + // Try to fold intrinsic into select/phi operands. This is legal if: // * The intrinsic is speculatable. // * The select condition is not a vector, or the intrinsic does not // perform cross-lane operations. if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) && isNotCrossLaneOperation(II)) - for (Value *Op : II->args()) + for (Value *Op : II->args()) { if (auto *Sel = dyn_cast<SelectInst>(Op)) if (Instruction *R = FoldOpIntoSelect(*II, Sel)) return R; + if (auto *Phi = dyn_cast<PHINode>(Op)) + if (Instruction *R = foldOpIntoPhi(*II, Phi)) + return R; + } if (Instruction *Shuf = foldShuffledIntrinsicOperands(II)) return Shuf; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 033ef8b..a43a6ee 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc, auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0)); if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) && all_equal(Shuf->getShuffleMask()) && - Shuf->getType() == Shuf->getOperand(0)->getType()) { + ElementCount::isKnownGE(Shuf->getType()->getElementCount(), + cast<VectorType>(Shuf->getOperand(0)->getType()) + ->getElementCount())) { // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask // trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask - Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType()); + Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType( + Trunc.getType()->getScalarType()); + Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy); return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask()); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index da9b126..b268fea 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -163,6 +163,11 @@ 
Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( LaterIndices.push_back(IdxVal); } + Value *Idx = GEP->getOperand(2); + // If the index type is non-canonical, wait for it to be canonicalized. + if (Idx->getType() != DL.getIndexType(GEP->getType())) + return nullptr; + enum { Overdefined = -3, Undefined = -2 }; // Variables for our state machines. @@ -290,17 +295,6 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal( // Now that we've scanned the entire array, emit our new comparison(s). We // order the state machines in complexity of the generated code. - Value *Idx = GEP->getOperand(2); - - // If the index is larger than the pointer offset size of the target, truncate - // the index down like the GEP would do implicitly. We don't have to do this - // for an inbounds GEP because the index can't be out of range. - if (!GEP->isInBounds()) { - Type *PtrIdxTy = DL.getIndexType(GEP->getType()); - unsigned OffsetSize = PtrIdxTy->getIntegerBitWidth(); - if (Idx->getType()->getPrimitiveSizeInBits().getFixedValue() > OffsetSize) - Idx = Builder.CreateTrunc(Idx, PtrIdxTy); - } // If inbounds keyword is not present, Idx * ElementSize can overflow. // Let's assume that ElementSize is 2 and the wanted value is at offset 0. diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index e2a9255..b992597 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1994,6 +1994,8 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN, } Clone = InsertNewInstBefore(Clone, OpBB->getTerminator()->getIterator()); Clones.insert({OpBB, Clone}); + // We may have speculated the instruction. + Clone->dropUBImplyingAttrsAndMetadata(); } NewPhiValues[OpIndex] = Clone; @@ -2777,6 +2779,12 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP, Indices.append(GEP.idx_begin()+1, GEP.idx_end()); } + // Don't create GEPs with more than one variable index. + unsigned NumVarIndices = + count_if(Indices, [](Value *Idx) { return !isa<Constant>(Idx); }); + if (NumVarIndices > 1) + return nullptr; + if (!Indices.empty()) return replaceInstUsesWith( GEP, Builder.CreateGEP( @@ -3199,6 +3207,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, NewGEP); } + // Strip trailing zero indices. + auto *LastIdx = dyn_cast<Constant>(Indices.back()); + if (LastIdx && LastIdx->isNullValue() && !LastIdx->getType()->isVectorTy()) { + return replaceInstUsesWith( + GEP, Builder.CreateGEP(GEP.getSourceElementType(), PtrOp, + drop_end(Indices), "", GEP.getNoWrapFlags())); + } + // Scalarize vector operands; prefer splat-of-gep.as canonical form. // Note that this looses information about undef lanes; we run it after // demanded bits to partially mitigate that loss. @@ -3225,6 +3241,30 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return replaceInstUsesWith(GEP, Res); } + bool SeenVarIndex = false; + for (auto [IdxNum, Idx] : enumerate(Indices)) { + if (isa<Constant>(Idx)) + continue; + + if (!SeenVarIndex) { + SeenVarIndex = true; + continue; + } + + // GEP has multiple variable indices: Split it. 
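    // For example (hypothetical IR), a GEP with two variable indices such as
    //   %g = getelementptr [4 x i32], ptr %p, i64 %a, i64 %b
    // is split below into
    //   %g.split = getelementptr [4 x i32], ptr %p, i64 %a
    //   %g       = getelementptr [4 x i32], ptr %g.split, i64 0, i64 %b
    // with the original no-wrap flags carried over to both halves.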
+ ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum); + Value *FrontGEP = + Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices, + GEP.getName() + ".split", GEP.getNoWrapFlags()); + + SmallVector<Value *> BackIndices; + BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy)); + append_range(BackIndices, drop_begin(Indices, IdxNum)); + return GetElementPtrInst::Create( + GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP, + BackIndices, GEP.getNoWrapFlags()); + } + // Check to see if the inputs to the PHI node are getelementptr instructions. if (auto *PN = dyn_cast<PHINode>(PtrOp)) { if (Value *NewPtrOp = foldGEPOfPhi(GEP, PN, Builder)) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 5849c3e..4e5a8d1 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -363,10 +363,10 @@ private: void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size); Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag); Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); - bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag, + void instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag, const DominatorTree &DT, const PostDominatorTree &PDT, const LoopInfo &LI); - bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); + void instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); Value *getNextTagWithCall(IRBuilder<> &IRB); Value *getStackBaseTag(IRBuilder<> &IRB); Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, unsigned AllocaNo); @@ -1418,7 +1418,7 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { } } -bool HWAddressSanitizer::instrumentLandingPads( +void HWAddressSanitizer::instrumentLandingPads( SmallVectorImpl<Instruction *> &LandingPadVec) { for (auto *LP : LandingPadVec) { IRBuilder<> IRB(LP->getNextNode()); @@ -1427,10 +1427,9 @@ bool HWAddressSanitizer::instrumentLandingPads( {memtag::readRegister( IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp")}); } - return true; } -bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, +void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, Value *StackTag, Value *UARTag, const DominatorTree &DT, const PostDominatorTree &PDT, @@ -1460,8 +1459,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, size_t Size = memtag::getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); - Value *AICast = IRB.CreatePointerCast(AI, PtrTy); - auto HandleLifetime = [&](IntrinsicInst *II) { // Set the lifetime intrinsic to cover the whole alloca. This reduces the // set of assumptions we need to make about the lifetime. Without this we @@ -1474,14 +1471,13 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, // one set of start / end in any execution (i.e. the ends are not // reachable from each other), so this will not cause any problems. 
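      // E.g., assuming the usual 16-byte HWASan object alignment, a 13-byte
      // alloca gets AlignedSize = 16, so the lifetime.start / lifetime.end
      // calls rewritten below cover the whole 16-byte granule rather than
      // the original 13 bytes.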
II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize)); - II->setArgOperand(1, AICast); }; llvm::for_each(Info.LifetimeStart, HandleLifetime); llvm::for_each(Info.LifetimeEnd, HandleLifetime); - AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) { + AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) { auto *User = U.getUser(); - return User != AILong && User != AICast && !isa<LifetimeIntrinsic>(User); + return User != AILong && !isa<LifetimeIntrinsic>(User); }); memtag::annotateDebugRecords(Info, retagMask(N)); @@ -1524,7 +1520,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, } memtag::alignAndPadAlloca(Info, Mapping.getObjectAlignment()); } - return true; } static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 854db0f..f451c2b 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -80,6 +80,27 @@ static cl::opt<unsigned> ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::desc("Skip Callsite up to this number for this compilation")); +// ICP the candidate function even when only a declaration is present. +static cl::opt<bool> ICPAllowDecls( + "icp-allow-decls", cl::init(false), cl::Hidden, + cl::desc("Promote the target candidate even when the defintion " + " is not available")); + +// ICP hot candidate functions only. When setting to false, non-cold functions +// (warm functions) can also be promoted. +static cl::opt<bool> + ICPAllowHotOnly("icp-allow-hot-only", cl::init(true), cl::Hidden, + cl::desc("Promote the target candidate only if it is a " + "hot function. Otherwise, warm functions can " + "also be promoted")); + +// If one target cannot be ICP'd, proceed with the remaining targets instead +// of exiting the callsite. +static cl::opt<bool> ICPAllowCandidateSkip( + "icp-allow-candidate-skip", cl::init(false), cl::Hidden, + cl::desc("Continue with the remaining targets instead of exiting " + "when failing in a candidate")); + // Set if the pass is called in LTO optimization. The difference for LTO mode // is the pass won't prefix the source module name to the internal linkage // symbols. @@ -330,6 +351,7 @@ private: struct PromotionCandidate { Function *const TargetFunction; const uint64_t Count; + const uint32_t Index; // The following fields only exists for promotion candidates with vtable // information. @@ -341,7 +363,8 @@ private: VTableGUIDCountsMap VTableGUIDAndCounts; SmallVector<Constant *> AddressPoints; - PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {} + PromotionCandidate(Function *F, uint64_t C, uint32_t I) + : TargetFunction(F), Count(C), Index(I) {} }; // Check if the indirect-call call site should be promoted. Return the number @@ -356,12 +379,10 @@ private: // Promote a list of targets for one indirect-call callsite by comparing // indirect callee with functions. Return true if there are IR // transformations and false otherwise. 
- bool tryToPromoteWithFuncCmp(CallBase &CB, Instruction *VPtr, - ArrayRef<PromotionCandidate> Candidates, - uint64_t TotalCount, - ArrayRef<InstrProfValueData> ICallProfDataRef, - uint32_t NumCandidates, - VTableGUIDCountsMap &VTableGUIDCounts); + bool tryToPromoteWithFuncCmp( + CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates, + uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef, + uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts); // Promote a list of targets for one indirect call by comparing vtables with // functions. Return true if there are IR transformations and false @@ -394,12 +415,15 @@ private: Constant *getOrCreateVTableAddressPointVar(GlobalVariable *GV, uint64_t AddressPointOffset); - void updateFuncValueProfiles(CallBase &CB, ArrayRef<InstrProfValueData> VDs, + void updateFuncValueProfiles(CallBase &CB, + MutableArrayRef<InstrProfValueData> VDs, uint64_t Sum, uint32_t MaxMDCount); void updateVPtrValueProfiles(Instruction *VPtr, VTableGUIDCountsMap &VTableGUIDCounts); + bool isValidTarget(uint64_t, Function *, const CallBase &, uint64_t); + public: IndirectCallPromoter( Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO, @@ -419,6 +443,53 @@ public: } // end anonymous namespace +bool IndirectCallPromoter::isValidTarget(uint64_t Target, + Function *TargetFunction, + const CallBase &CB, uint64_t Count) { + // Don't promote if the symbol is not defined in the module. This avoids + // creating a reference to a symbol that doesn't exist in the module + // This can happen when we compile with a sample profile collected from + // one binary but used for another, which may have profiled targets that + // aren't used in the new binary. We might have a declaration initially in + // the case where the symbol is globally dead in the binary and removed by + // ThinLTO. + using namespace ore; + if (TargetFunction == nullptr) { + LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB) + << "Cannot promote indirect call: target with md5sum " + << NV("target md5sum", Target) + << " not found (count=" << NV("Count", Count) << ")"; + }); + return false; + } + if (!ICPAllowDecls && TargetFunction->isDeclaration()) { + LLVM_DEBUG(dbgs() << " Not promote: target definition is not available\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoTargetDef", &CB) + << "Do not promote indirect call: target with md5sum " + << NV("target md5sum", Target) + << " definition not available (count=" << ore::NV("Count", Count) + << ")"; + }); + return false; + } + + const char *Reason = nullptr; + if (!isLegalToPromote(CB, TargetFunction, &Reason)) { + + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB) + << "Cannot promote indirect call to " + << NV("TargetFunction", TargetFunction) + << " (count=" << NV("Count", Count) << "): " << Reason; + }); + return false; + } + return true; +} + // Indirect-call promotion heuristic. The direct targets are sorted based on // the count. Stop at the first target that is not promoted. std::vector<IndirectCallPromoter::PromotionCandidate> @@ -469,38 +540,15 @@ IndirectCallPromoter::getPromotionCandidatesForCallSite( break; } - // Don't promote if the symbol is not defined in the module. 
This avoids - // creating a reference to a symbol that doesn't exist in the module - // This can happen when we compile with a sample profile collected from - // one binary but used for another, which may have profiled targets that - // aren't used in the new binary. We might have a declaration initially in - // the case where the symbol is globally dead in the binary and removed by - // ThinLTO. Function *TargetFunction = Symtab->getFunction(Target); - if (TargetFunction == nullptr || TargetFunction->isDeclaration()) { - LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n"); - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB) - << "Cannot promote indirect call: target with md5sum " - << ore::NV("target md5sum", Target) << " not found"; - }); - break; - } - - const char *Reason = nullptr; - if (!isLegalToPromote(CB, TargetFunction, &Reason)) { - using namespace ore; - - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB) - << "Cannot promote indirect call to " - << NV("TargetFunction", TargetFunction) << " with count of " - << NV("Count", Count) << ": " << Reason; - }); - break; + if (!isValidTarget(Target, TargetFunction, CB, Count)) { + if (ICPAllowCandidateSkip) + continue; + else + break; } - Ret.push_back(PromotionCandidate(TargetFunction, Count)); + Ret.push_back(PromotionCandidate(TargetFunction, Count, I)); TotalCount -= Count; } return Ret; @@ -642,7 +690,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee, // Promote indirect-call to conditional direct-call for one callsite. bool IndirectCallPromoter::tryToPromoteWithFuncCmp( CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates, - uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef, + uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef, uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts) { uint32_t NumPromoted = 0; @@ -655,6 +703,8 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp( NumOfPGOICallPromotion++; NumPromoted++; + // Update the count and this entry will be erased later. + ICallProfDataRef[C.Index].Count = 0; if (!EnableVTableProfileUse || C.VTableGUIDAndCounts.empty()) continue; @@ -679,21 +729,33 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp( "Number of promoted functions should not be greater than the number " "of values in profile metadata"); - // Update value profiles on the indirect call. - updateFuncValueProfiles(CB, ICallProfDataRef.slice(NumPromoted), TotalCount, - NumCandidates); + updateFuncValueProfiles(CB, ICallProfDataRef, TotalCount, NumCandidates); updateVPtrValueProfiles(VPtr, VTableGUIDCounts); return true; } void IndirectCallPromoter::updateFuncValueProfiles( - CallBase &CB, ArrayRef<InstrProfValueData> CallVDs, uint64_t TotalCount, - uint32_t MaxMDCount) { + CallBase &CB, MutableArrayRef<InstrProfValueData> CallVDs, + uint64_t TotalCount, uint32_t MaxMDCount) { // First clear the existing !prof. CB.setMetadata(LLVMContext::MD_prof, nullptr); + + // Sort value profiles by count in descending order. + llvm::stable_sort(CallVDs, [](const InstrProfValueData &LHS, + const InstrProfValueData &RHS) { + return LHS.Count > RHS.Count; + }); + // Drop the <target-value, count> pair if count is zero. 
+ ArrayRef<InstrProfValueData> VDs( + CallVDs.begin(), + llvm::upper_bound(CallVDs, 0U, + [](uint64_t Count, const InstrProfValueData &ProfData) { + return ProfData.Count <= Count; + })); + // Annotate the remaining value profiles if counter is not zero. if (TotalCount != 0) - annotateValueSite(M, CB, CallVDs, TotalCount, IPVK_IndirectCallTarget, + annotateValueSite(M, CB, VDs, TotalCount, IPVK_IndirectCallTarget, MaxMDCount); } @@ -726,7 +788,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp( uint64_t TotalFuncCount, uint32_t NumCandidates, MutableArrayRef<InstrProfValueData> ICallProfDataRef, VTableGUIDCountsMap &VTableGUIDCounts) { - SmallVector<uint64_t, 4> PromotedFuncCount; + SmallVector<std::pair<uint32_t, uint64_t>, 4> PromotedFuncCount; for (const auto &Candidate : Candidates) { for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts) @@ -771,7 +833,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp( return Remark; }); - PromotedFuncCount.push_back(Candidate.Count); + PromotedFuncCount.push_back({Candidate.Index, Candidate.Count}); assert(TotalFuncCount >= Candidate.Count && "Within one prof metadata, total count is the sum of counts from " @@ -792,22 +854,12 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp( // used to load multiple virtual functions. The vtable profiles needs to be // updated properly in that case (e.g, for each indirect call annotate both // type profiles and function profiles in one !prof). - for (size_t I = 0; I < PromotedFuncCount.size(); I++) - ICallProfDataRef[I].Count -= - std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count); - // Sort value profiles by count in descending order. - llvm::stable_sort(ICallProfDataRef, [](const InstrProfValueData &LHS, - const InstrProfValueData &RHS) { - return LHS.Count > RHS.Count; - }); - // Drop the <target-value, count> pair if count is zero. - ArrayRef<InstrProfValueData> VDs( - ICallProfDataRef.begin(), - llvm::upper_bound(ICallProfDataRef, 0U, - [](uint64_t Count, const InstrProfValueData &ProfData) { - return ProfData.Count <= Count; - })); - updateFuncValueProfiles(CB, VDs, TotalFuncCount, NumCandidates); + for (size_t I = 0; I < PromotedFuncCount.size(); I++) { + uint32_t Index = PromotedFuncCount[I].first; + ICallProfDataRef[Index].Count -= + std::max(PromotedFuncCount[I].second, ICallProfDataRef[Index].Count); + } + updateFuncValueProfiles(CB, ICallProfDataRef, TotalFuncCount, NumCandidates); updateVPtrValueProfiles(VPtr, VTableGUIDCounts); return true; } @@ -822,9 +874,22 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) { uint64_t TotalCount; auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction( CB, TotalCount, NumCandidates); - if (!NumCandidates || - (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount))) + if (!NumCandidates) continue; + if (PSI && PSI->hasProfileSummary()) { + // Don't promote cold candidates. + if (PSI->isColdCount(TotalCount)) { + LLVM_DEBUG(dbgs() << "Don't promote the cold candidate: TotalCount=" + << TotalCount << "\n"); + continue; + } + // Only pormote hot if ICPAllowHotOnly is true. 
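      // Intended triage, sketched: cold callsites were already skipped
      // above; with the default -icp-allow-hot-only=true only hot callsites
      // pass the check below, while -icp-allow-hot-only=false additionally
      // lets warm (neither hot nor cold) callsites through to promotion.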
+ if (ICPAllowHotOnly && !PSI->isHotCount(TotalCount)) { + LLVM_DEBUG(dbgs() << "Don't promote the non-hot candidate: TotalCount=" + << TotalCount << "\n"); + continue; + } + } auto PromotionCandidates = getPromotionCandidatesForCallSite( *CB, ICallProfDataRef, TotalCount, NumCandidates); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index df31f07..54d9a83 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + // Approximately handle AVX Galois Field Affine Transformation + // + // e.g., + // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) + // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) + // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8) + // Out A x b + // where A and x are packed matrices, b is a vector, + // Out = A * x + b in GF(2) + // + // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix + // computation also includes a parity calculation. + // + // For the bitwise AND of bits V1 and V2, the exact shadow is: + // Out_Shadow = (V1_Shadow & V2_Shadow) + // | (V1 & V2_Shadow) + // | (V1_Shadow & V2 ) + // + // We approximate the shadow of gf2p8affineqb using: + // Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0) + // | gf2p8affineqb(x, A_shadow, 0) + // | gf2p8affineqb(x_Shadow, A, 0) + // | set1_epi8(b_Shadow) + // + // This approximation has false negatives: if an intermediate dot-product + // contains an even number of 1's, the parity is 0. + // It has no false positives. + void handleAVXGF2P8Affine(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + assert(I.arg_size() == 3); + Value *A = I.getOperand(0); + Value *X = I.getOperand(1); + Value *B = I.getOperand(2); + + assert(isFixedIntVector(A)); + assert(cast<VectorType>(A->getType()) + ->getElementType() + ->getScalarSizeInBits() == 8); + + assert(A->getType() == X->getType()); + + assert(B->getType()->isIntegerTy()); + assert(B->getType()->getScalarSizeInBits() == 8); + + assert(I.getType() == A->getType()); + + Value *AShadow = getShadow(A); + Value *XShadow = getShadow(X); + Value *BZeroShadow = getCleanShadow(B); + + CallInst *AShadowXShadow = IRB.CreateIntrinsic( + I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow}); + CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {X, AShadow, BZeroShadow}); + CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), + {XShadow, A, BZeroShadow}); + + unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements(); + Value *BShadow = getShadow(B); + Value *BBroadcastShadow = getCleanShadow(AShadow); + // There is no LLVM IR intrinsic for _mm512_set1_epi8. + // This loop generates a lot of LLVM IR, which we expect that CodeGen will + // lower appropriately (e.g., VPBROADCASTB). + // Besides, b is often a constant, in which case it is fully initialized. + for (unsigned i = 0; i < NumElements; i++) + BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i); + + setShadow(&I, IRB.CreateOr( + {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow})); + setOriginForNaryOp(I); + } + // Handle Arm NEON vector load intrinsics (vld*). 
// // The WithLane instructions (ld[234]lane) are similar to: @@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + // AVX Galois Field New Instructions + case Intrinsic::x86_vgf2p8affineqb_128: + case Intrinsic::x86_vgf2p8affineqb_256: + case Intrinsic::x86_vgf2p8affineqb_512: + handleAVXGF2P8Affine(I); + break; + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index f6780c0..ce1d9f1 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, if (DisableMemOPOPT) return false; - if (F.hasFnAttribute(Attribute::OptimizeForSize)) + if (F.hasOptSize()) return false; MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); MemOPSizeOpt.perform(); diff --git a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt index 80867db..4274667 100644 --- a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt +++ b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt @@ -2,7 +2,6 @@ add_llvm_component_library(LLVMObjCARCOpts ObjCARC.cpp ObjCARCOpts.cpp ObjCARCExpand.cpp - ObjCARCAPElim.cpp ObjCARCContract.cpp DependencyAnalysis.cpp ProvenanceAnalysis.cpp diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp deleted file mode 100644 index dceb2eb..0000000 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// -/// This file defines ObjC ARC optimizations. ARC stands for Automatic -/// Reference Counting and is a system for managing reference counts for objects -/// in Objective C. -/// -/// This specific file implements optimizations which remove extraneous -/// autorelease pools. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/ObjCARCAnalysisUtils.h" -#include "llvm/Analysis/ObjCARCInstKind.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/ObjCARC.h" - -using namespace llvm; -using namespace llvm::objcarc; - -#define DEBUG_TYPE "objc-arc-ap-elim" - -namespace { - -/// Interprocedurally determine if calls made by the given call site can -/// possibly produce autoreleases. 
-bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) { - if (const Function *Callee = CB.getCalledFunction()) { - if (!Callee->hasExactDefinition()) - return true; - for (const BasicBlock &BB : *Callee) { - for (const Instruction &I : BB) - if (const CallBase *JCB = dyn_cast<CallBase>(&I)) - // This recursion depth limit is arbitrary. It's just great - // enough to cover known interesting testcases. - if (Depth < 3 && !JCB->onlyReadsMemory() && - MayAutorelease(*JCB, Depth + 1)) - return true; - } - return false; - } - - return true; -} - -bool OptimizeBB(BasicBlock *BB) { - bool Changed = false; - - Instruction *Push = nullptr; - for (Instruction &Inst : llvm::make_early_inc_range(*BB)) { - switch (GetBasicARCInstKind(&Inst)) { - case ARCInstKind::AutoreleasepoolPush: - Push = &Inst; - break; - case ARCInstKind::AutoreleasepoolPop: - // If this pop matches a push and nothing in between can autorelease, - // zap the pair. - if (Push && cast<CallInst>(&Inst)->getArgOperand(0) == Push) { - Changed = true; - LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop " - "autorelease pair:\n" - " Pop: " - << Inst << "\n" - << " Push: " << *Push - << "\n"); - Inst.eraseFromParent(); - Push->eraseFromParent(); - } - Push = nullptr; - break; - case ARCInstKind::CallOrUser: - if (MayAutorelease(cast<CallBase>(Inst))) - Push = nullptr; - break; - default: - break; - } - } - - return Changed; -} - -bool runImpl(Module &M) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - if (!ModuleHasARC(M)) - return false; - // Find the llvm.global_ctors variable, as the first step in - // identifying the global constructors. In theory, unnecessary autorelease - // pools could occur anywhere, but in practice it's pretty rare. Global - // ctors are a place where autorelease pools get inserted automatically, - // so it's pretty common for them to be unnecessary, and it's pretty - // profitable to eliminate them. - GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); - if (!GV) - return false; - - assert(GV->hasDefinitiveInitializer() && - "llvm.global_ctors is uncooperative!"); - - bool Changed = false; - - // Dig the constructor functions out of GV's initializer. - ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); - for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); - OI != OE; ++OI) { - Value *Op = *OI; - // llvm.global_ctors is an array of three-field structs where the second - // members are constructor functions. - Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); - // If the user used a constructor function with the wrong signature and - // it got bitcasted or whatever, look the other way. - if (!F) - continue; - // Only look at function definitions. - if (F->isDeclaration()) - continue; - // Only look at functions with one basic block. - if (std::next(F->begin()) != F->end()) - continue; - // Ok, a single-block constructor function definition. Try to optimize it. 
- Changed |= OptimizeBB(&F->front()); - } - - return Changed; -} - -} // namespace - -PreservedAnalyses ObjCARCAPElimPass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runImpl(M)) - return PreservedAnalyses::all(); - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; -} diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 0f63ed0..9b87180 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -1360,13 +1360,10 @@ struct DSEState { /// indicating whether \p I is a free-like call. std::optional<std::pair<MemoryLocation, bool>> getLocForTerminator(Instruction *I) const { - uint64_t Len; - Value *Ptr; - if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len), - m_Value(Ptr)))) - return {std::make_pair(MemoryLocation(Ptr, Len), false)}; - if (auto *CB = dyn_cast<CallBase>(I)) { + if (CB->getIntrinsicID() == Intrinsic::lifetime_end) + return { + std::make_pair(MemoryLocation::getForArgument(CB, 1, &TLI), false)}; if (Value *FreedOp = getFreedOperand(CB, &TLI)) return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)}; } diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index e706a6f..deff79b 100644 --- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -237,8 +237,7 @@ class InductiveRangeCheckElimination { DominatorTree &DT; LoopInfo &LI; - using GetBFIFunc = - std::optional<llvm::function_ref<llvm::BlockFrequencyInfo &()>>; + using GetBFIFunc = llvm::function_ref<llvm::BlockFrequencyInfo &()>; GetBFIFunc GetBFI; // Returns the estimated number of iterations based on block frequency info if @@ -249,7 +248,7 @@ class InductiveRangeCheckElimination { public: InductiveRangeCheckElimination(ScalarEvolution &SE, BranchProbabilityInfo *BPI, DominatorTree &DT, - LoopInfo &LI, GetBFIFunc GetBFI = std::nullopt) + LoopInfo &LI, GetBFIFunc GetBFI = nullptr) : SE(SE), BPI(BPI), DT(DT), LI(LI), GetBFI(GetBFI) {} bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop); @@ -959,7 +958,7 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) { std::optional<uint64_t> InductiveRangeCheckElimination::estimatedTripCount(const Loop &L) { if (GetBFI) { - BlockFrequencyInfo &BFI = (*GetBFI)(); + BlockFrequencyInfo &BFI = GetBFI(); uint64_t hFreq = BFI.getBlockFreq(L.getHeader()).getFrequency(); uint64_t phFreq = BFI.getBlockFreq(L.getLoopPreheader()).getFrequency(); if (phFreq == 0 || hFreq == 0) diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index d6bd92d..b5eb647 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1176,6 +1176,28 @@ private: return true; } + /// This function fixes PHI nodes after fusion in \p SafeToSink. + /// \p SafeToSink instructions are the instructions that are to be moved past + /// the fused loop. Thus, the PHI nodes in \p SafeToSink should be updated to + /// receive values from the fused loop if they are currently taking values + /// from the first loop (i.e. FC0)'s latch. + void fixPHINodes(ArrayRef<Instruction *> SafeToSink, + const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + for (Instruction *Inst : SafeToSink) { + // No update needed for non-PHI nodes. 
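// For PHI nodes the incoming block is rewired. Illustrative sketch (names are
// made up): a sunk phi such as 'phi [ %v, %FC0.Latch ], [ %w, %OtherBB ]'
// keeps its value %v but now receives it via FC1.Latch, the latch of the
// fused loop, matching the update performed below.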
+ PHINode *Phi = dyn_cast<PHINode>(Inst); + if (!Phi) + continue; + for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) { + if (Phi->getIncomingBlock(I) != FC0.Latch) + continue; + assert(FC1.Latch && "FC1 latch is not set"); + Phi->setIncomingBlock(I, FC1.Latch); + } + } + } + /// Collect instructions in the \p FC1 Preheader that can be hoisted /// to the \p FC0 Preheader or sunk into the \p FC1 Body bool collectMovablePreheaderInsts( @@ -1481,6 +1503,9 @@ private: assert(I->getParent() == FC1.Preheader); I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt()); } + // PHI nodes in SinkInsts need to be updated to receive values from the + // fused loop. + fixPHINodes(SinkInsts, FC0, FC1); } /// Determine if two fusion candidates have identical guards diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 70e9eee..08446cc 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -17,8 +17,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -70,6 +70,13 @@ namespace { using LoopVector = SmallVector<Loop *, 8>; +/// A list of direction vectors. Each entry represents a direction vector +/// corresponding to one or more dependencies existing in the loop nest. The +/// length of all direction vectors is equal and is N + 1, where N is the depth +/// of the loop nest. The first N elements correspond to the dependency +/// direction of each of the N loops. The last one indicates whether this entry is +/// a forward dependency ('<') or not ('*'). The term "forward" aligns with what +/// is defined in LoopAccessAnalysis. // TODO: Check if we can use a sparse matrix here. using CharMatrix = std::vector<std::vector<char>>; @@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) { static void printDepMatrix(CharMatrix &DepMatrix) { for (auto &Row : DepMatrix) { - for (auto D : Row) + // Drop the last element because it is a flag indicating whether this is + // a forward dependency or not, which doesn't affect the legality check. + for (char D : drop_end(Row)) LLVM_DEBUG(dbgs() << D << " "); LLVM_DEBUG(dbgs() << "\n"); } } + +/// Return true if \p Src appears before \p Dst in the same basic block. +/// Precondition: \p Src and \p Dst are distinct instructions within the same +/// basic block. +static bool inThisOrder(const Instruction *Src, const Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && Src != Dst && + "Expected Src and Dst to be different instructions in the same BB"); + + bool FoundSrc = false; + for (const Instruction &I : *(Src->getParent())) { + if (&I == Src) { + FoundSrc = true; + continue; + } + if (&I == Dst) + return FoundSrc; + } + + llvm_unreachable("Dst not found"); +} #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, @@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; } ValueVector::iterator I, IE, J, JE; - StringSet<> Seen; + + // Manage direction vectors that are already seen. Map each direction vector + // to an index of DepMatrix at which it is stored.
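// Illustrative example (values are made up): in a loop nest of depth 2 the
// key is only the first N = 2 characters. If "=<" was first stored as row 3
// with summary '<', and a later dependency with the same "=<" prefix cannot
// be proven forward, row 3's trailing '<' is downgraded to '*' below instead
// of appending a duplicate row.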
+ StringMap<unsigned> Seen; for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { for (J = I, JE = MemInstr.end(); J != JE; ++J) { @@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Dep.push_back('I'); } + // Test whether the dependency is forward or not. + bool IsKnownForward = true; + if (Src->getParent() != Dst->getParent()) { + // In general, when Src and Dst are in different BBs, the execution + // order of them within a single iteration is not guaranteed. Treat + // conservatively as not-forward dependency in this case. + IsKnownForward = false; + } else { + // Src and Dst are in the same BB. If they are the different + // instructions, Src should appear before Dst in the BB as they are + // stored to MemInstr in that order. + assert((Src == Dst || inThisOrder(Src, Dst)) && + "Unexpected instructions"); + + // If the Dependence object is reversed (due to normalization), it + // represents the dependency from Dst to Src, meaning it is a backward + // dependency. Otherwise it should be a forward dependency. + bool IsReversed = D->getSrc() != Src; + if (IsReversed) + IsKnownForward = false; + } + + // Initialize the last element. Assume forward dependencies only; it + // will be updated later if there is any non-forward dependency. + Dep.push_back('<'); + + // The last element should express the "summary" among one or more + // direction vectors whose first N elements are the same (where N is + // the depth of the loop nest). Hence we exclude the last element from + // the Seen map. + auto [Ite, Inserted] = Seen.try_emplace( + StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size()); + // Make sure we only add unique entries to the dependency matrix. - if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) + if (Inserted) DepMatrix.push_back(Dep); + + // If we cannot prove that this dependency is forward, change the last + // element of the corresponding entry. Since a `[... *]` dependency + // includes a `[... <]` dependency, we do not need to keep both and + // change the existing entry instead. + if (!IsKnownForward) + DepMatrix[Ite->second].back() = '*'; } } } @@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, continue; // Check if the direction vector is lexicographically positive (or zero) - // for both before/after exchanged. - if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) + // for both before/after exchanged. Ignore the last element because it + // doesn't affect the legality. + if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) return false; std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); - if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) + if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) return false; } return true; @@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { for (const auto &Dep : DepMatrix) { char Dir = Dep[LoopId]; - if (Dir != 'I' && Dir != '=') - return false; + char DepType = Dep.back(); + assert((DepType == '<' || DepType == '*') && + "Unexpected element in dependency vector"); + + // There are no loop-carried dependencies. + if (Dir == '=' || Dir == 'I') + continue; + + // DepType being '<' means that this direction vector represents a forward + // dependency. In principle, a loop with '<' direction can be vectorized in + // this case. 
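// Worked example (hypothetical matrix): with rows [= < <] and [< * *], the
// loop at index 0 passes the check for the first row ('=') but fails for the
// second ('<' direction with '*' summary), so it is treated as not
// vectorizable.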
+ if (Dir == '<' && DepType == '<') + continue; + + // We cannot prove that the loop is vectorizable. + return false; } return true; } std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // If the outer loop is not loop independent it is not profitable to move - // this to inner position, since doing so would not enable inner loop - // parallelism. + // If the outer loop cannot be vectorized, it is not profitable to move this + // to inner position. if (!canVectorize(DepMatrix, OuterLoopId)) return false; - // If inner loop has dependence and outer loop is loop independent then it is - // profitable to interchange to enable inner loop parallelism. + // If the inner loop cannot be vectorized but the outer loop can be, then it + // is profitable to interchange to enable inner loop parallelism. if (!canVectorize(DepMatrix, InnerLoopId)) return true; diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 9e318b0..e3ef9d8 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3785,7 +3785,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { // Ignore icmp instructions which are already being analyzed. if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { unsigned OtherIdx = !U.getOperandNo(); - Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); + Value *OtherOp = ICI->getOperand(OtherIdx); if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) continue; } diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 84d1c0b..9220abb 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1593,11 +1593,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, // since both llvm.lifetime.start and llvm.lifetime.end intrinsics // practically fill all the bytes of the alloca with an undefined // value, although conceptually marked as alive/dead. - int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; - } + LifetimeMarkers.push_back(UI); + continue; } AAMetadataInstrs.insert(UI); @@ -1614,9 +1611,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - // Check that dest has no Mod/Ref, from the alloca to the Store, except full - // size lifetime intrinsics. And collect modref inst for the reachability - // check. + // Check that dest has no Mod/Ref, from the alloca to the Store. And collect + // modref inst for the reachability check. 
ModRefInfo DestModRef = ModRefInfo::NoModRef; MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size)); SmallVector<BasicBlock *, 8> ReachabilityWorklist; diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 6a3f656..1a52af1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -651,7 +651,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; - mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred; + mutable DenseMap<const IntrinsicInst *, const Value *> PredicateSwapChoice; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -840,7 +840,7 @@ private: // Ranking unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; - bool shouldSwapOperandsForIntrinsic(const Value *, const Value *, + bool shouldSwapOperandsForPredicate(const Value *, const Value *, const IntrinsicInst *I) const; // Reachability handling. @@ -1624,7 +1624,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { Value *AdditionallyUsedValue = CmpOp0; // Sort the ops. - if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) { + if (shouldSwapOperandsForPredicate(FirstOp, SecondOp, I)) { std::swap(FirstOp, SecondOp); Predicate = CmpInst::getSwappedPredicate(Predicate); AdditionallyUsedValue = CmpOp1; @@ -3024,7 +3024,7 @@ void NewGVN::cleanupTables() { PredicateToUsers.clear(); MemoryToUsers.clear(); RevisitOnReachabilityChange.clear(); - IntrinsicInstPred.clear(); + PredicateSwapChoice.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -4250,20 +4250,18 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } -bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B, +bool NewGVN::shouldSwapOperandsForPredicate(const Value *A, const Value *B, const IntrinsicInst *I) const { - auto LookupResult = IntrinsicInstPred.find(I); if (shouldSwapOperands(A, B)) { - if (LookupResult == IntrinsicInstPred.end()) - IntrinsicInstPred.insert({I, B}); - else - LookupResult->second = B; + PredicateSwapChoice[I] = B; return true; } - if (LookupResult != IntrinsicInstPred.end()) { + auto LookupResult = PredicateSwapChoice.find(I); + if (LookupResult != PredicateSwapChoice.end()) { auto *SeenPredicate = LookupResult->second; if (SeenPredicate) { + // We previously decided to swap B to the left. Keep that choice. 
if (SeenPredicate == B) return true; else diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index ced61cb..aae5d60 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -458,8 +458,10 @@ bool ScalarizerVisitor::visit(Function &F) { Instruction *I = &*II; bool Done = InstVisitor::visit(I); ++II; - if (Done && I->getType()->isVoidTy()) + if (Done && I->getType()->isVoidTy()) { I->eraseFromParent(); + Scalarized = true; + } } } return finish(); diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index a69d649..44e63a0 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -128,6 +129,7 @@ struct PredInfo { using BBPredicates = DenseMap<BasicBlock *, PredInfo>; using PredMap = DenseMap<BasicBlock *, BBPredicates>; using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>; +using Val2BBMap = DenseMap<Value *, BasicBlock *>; // A traits type that is intended to be used in graph algorithms. The graph // traits starts at an entry node, and traverses the RegionNodes that are in @@ -279,7 +281,7 @@ class StructurizeCFG { ConstantInt *BoolTrue; ConstantInt *BoolFalse; Value *BoolPoison; - + const TargetTransformInfo *TTI; Function *Func; Region *ParentRegion; @@ -301,8 +303,12 @@ class StructurizeCFG { PredMap LoopPreds; BranchVector LoopConds; + Val2BBMap HoistedValues; + RegionNode *PrevNode; + void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB); + void orderNodes(); void analyzeLoops(RegionNode *N); @@ -332,6 +338,8 @@ class StructurizeCFG { void simplifyAffectedPhis(); + void simplifyHoistedPhis(); + DebugLoc killTerminator(BasicBlock *BB); void changeExit(RegionNode *Node, BasicBlock *NewExit, @@ -359,7 +367,7 @@ class StructurizeCFG { public: void init(Region *R); - bool run(Region *R, DominatorTree *DT); + bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI); bool makeUniformRegion(Region *R, UniformityInfo &UA); }; @@ -385,8 +393,11 @@ public: if (SCFG.makeUniformRegion(R, UA)) return false; } + Function *F = R->getEntry()->getParent(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F); DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - return SCFG.run(R, DT); + return SCFG.run(R, DT, TTI); } StringRef getPassName() const override { return "Structurize control flow"; } @@ -394,7 +405,9 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { if (SkipUniformRegions) AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); RegionPass::getAnalysisUsage(AU); @@ -403,6 +416,34 @@ public: } // end anonymous namespace +/// Checks whether an instruction is zero cost instruction and checks if the +/// operands are from different BB. If so, this instruction can be coalesced +/// if its hoisted to predecessor block. So, this returns true. 
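/// For instance, a bitcast in the Else block whose source value is defined in
/// a dominating block can typically be hoisted (assuming the target reports a
/// zero latency cost for it), while any instruction that uses a value defined
/// inside the Else block itself cannot.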
+static bool isHoistableInstruction(Instruction *I, BasicBlock *BB, + const TargetTransformInfo *TTI) { + if (I->getParent() != BB || isa<PHINode>(I)) + return false; + + // If the instruction is not a zero cost instruction, return false. + auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); + InstructionCost::CostType CostVal = + Cost.isValid() + ? Cost.getValue() + : (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive; + if (CostVal != 0) + return false; + + // Check if any operands are instructions defined in the same block. + for (auto &Op : I->operands()) { + if (auto *OpI = dyn_cast<Instruction>(Op)) { + if (OpI->getParent() == BB) + return false; + } + } + + return true; +} + char StructurizeCFGLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg", @@ -413,6 +454,39 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg", "Structurize the CFG", false, false) +/// Structurization can introduce unnecessary VGPR copies due to register +/// coalescing interference. For example, if the Else block has a zero-cost +/// instruction and the Then block modifies the VGPR value, only one value is +/// live at a time in merge block before structurization. After structurization, +/// the coalescer may incorrectly treat the Then value as live in the Else block +/// (via the path Then → Flow → Else), leading to unnecessary VGPR copies. +/// +/// This function examines phi nodes whose incoming values are zero-cost +/// instructions in the Else block. It identifies such values that can be safely +/// hoisted and moves them to the nearest common dominator of Then and Else +/// blocks. A follow-up function after setting PhiNodes assigns the hoisted +/// value to poison phi nodes along the if→flow edge, aiding register coalescing +/// and minimizing unnecessary live ranges. +void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, + BasicBlock *ThenBB) { + + BasicBlock *ElseSucc = ElseBB->getSingleSuccessor(); + BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB); + + if (!ElseSucc || !CommonDominator) + return; + Instruction *Term = CommonDominator->getTerminator(); + for (PHINode &Phi : ElseSucc->phis()) { + Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB); + auto *Inst = dyn_cast<Instruction>(ElseVal); + if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI)) + continue; + Inst->removeFromParent(); + Inst->insertInto(CommonDominator, Term->getIterator()); + HoistedValues[Inst] = CommonDominator; + } +} + /// Build up the general order of nodes, by performing a topological sort of the /// parent region's nodes, while ensuring that there is no outer cycle node /// between any two inner cycle nodes. 
@@ -535,7 +609,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { BasicBlock *Other = Term->getSuccessor(!i); if (Visited.count(Other) && !Loops.count(Other) && !Pred.count(Other) && !Pred.count(P)) { - + hoistZeroCostElseBlockPhiValues(Succ, Other); Pred[Other] = {BoolFalse, std::nullopt}; Pred[P] = {BoolTrue, std::nullopt}; continue; @@ -891,6 +965,44 @@ void StructurizeCFG::setPhiValues() { AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); } +/// Updates PHI nodes after hoisted zero cost instructions by replacing poison +/// entries on Flow nodes with the appropriate hoisted values +void StructurizeCFG::simplifyHoistedPhis() { + for (WeakVH VH : AffectedPhis) { + PHINode *Phi = dyn_cast_or_null<PHINode>(VH); + if (!Phi || Phi->getNumIncomingValues() != 2) + continue; + + for (int i = 0; i < 2; i++) { + Value *V = Phi->getIncomingValue(i); + auto BBIt = HoistedValues.find(V); + + if (BBIt == HoistedValues.end()) + continue; + + Value *OtherV = Phi->getIncomingValue(!i); + PHINode *OtherPhi = dyn_cast<PHINode>(OtherV); + if (!OtherPhi) + continue; + + int PoisonValBBIdx = -1; + for (size_t i = 0; i < OtherPhi->getNumIncomingValues(); i++) { + if (!isa<PoisonValue>(OtherPhi->getIncomingValue(i))) + continue; + PoisonValBBIdx = i; + break; + } + if (PoisonValBBIdx == -1 || + !DT->dominates(BBIt->second, + OtherPhi->getIncomingBlock(PoisonValBBIdx))) + continue; + + OtherPhi->setIncomingValue(PoisonValBBIdx, V); + Phi->setIncomingValue(i, OtherV); + } + } +} + void StructurizeCFG::simplifyAffectedPhis() { bool Changed; do { @@ -1283,12 +1395,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { } /// Run the transformation for each region found -bool StructurizeCFG::run(Region *R, DominatorTree *DT) { +bool StructurizeCFG::run(Region *R, DominatorTree *DT, + const TargetTransformInfo *TTI) { if (R->isTopLevelRegion()) return false; this->DT = DT; - + this->TTI = TTI; Func = R->getEntry()->getParent(); assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); @@ -1300,6 +1413,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) { insertConditions(false); insertConditions(true); setPhiValues(); + simplifyHoistedPhis(); simplifyConditions(); simplifyAffectedPhis(); rebuildSSA(); @@ -1349,7 +1463,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F, bool Changed = false; DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); auto &RI = AM.getResult<RegionInfoAnalysis>(F); - + TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F); UniformityInfo *UI = nullptr; if (SkipUniformRegions) UI = &AM.getResult<UniformityInfoAnalysis>(F); @@ -1368,7 +1482,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F, continue; } - Changed |= SCFG.run(R, DT); + Changed |= SCFG.run(R, DT, TTI); } if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 9fe655e..fca09c6 100644 --- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) { static bool runImpl(Function &F, const TargetLibraryInfo &TLI, DominatorTree *DT) { - if (F.hasFnAttribute(Attribute::OptimizeForSize)) + if (F.hasOptSize()) return false; DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); LibCallsShrinkWrap CCDCE(TLI, DTU); diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp 
index b9292af..b78c702 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -703,6 +703,7 @@ private: // Add U as additional user of V. void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); } + void handlePredicate(Instruction *I, Value *CopyOf, const PredicateBase *PI); void handleCallOverdefined(CallBase &CB); void handleCallResult(CallBase &CB); void handleCallArguments(CallBase &CB); @@ -1927,6 +1928,75 @@ void SCCPInstVisitor::handleCallArguments(CallBase &CB) { } } +void SCCPInstVisitor::handlePredicate(Instruction *I, Value *CopyOf, + const PredicateBase *PI) { + ValueLatticeElement CopyOfVal = getValueState(CopyOf); + const std::optional<PredicateConstraint> &Constraint = PI->getConstraint(); + if (!Constraint) { + mergeInValue(ValueState[I], I, CopyOfVal); + return; + } + + CmpInst::Predicate Pred = Constraint->Predicate; + Value *OtherOp = Constraint->OtherOp; + + // Wait until OtherOp is resolved. + if (getValueState(OtherOp).isUnknown()) { + addAdditionalUser(OtherOp, I); + return; + } + + ValueLatticeElement CondVal = getValueState(OtherOp); + ValueLatticeElement &IV = ValueState[I]; + if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { + auto ImposedCR = + ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType())); + + // Get the range imposed by the condition. + if (CondVal.isConstantRange()) + ImposedCR = ConstantRange::makeAllowedICmpRegion( + Pred, CondVal.getConstantRange()); + + // Combine range info for the original value with the new range from the + // condition. + auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(), + /*UndefAllowed=*/true); + // Treat an unresolved input like a full range. + if (CopyOfCR.isEmptySet()) + CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth()); + auto NewCR = ImposedCR.intersectWith(CopyOfCR); + // If the existing information is != x, do not use the information from + // a chained predicate, as the != x information is more likely to be + // helpful in practice. + if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) + NewCR = CopyOfCR; + + // The new range is based on a branch condition. That guarantees that + // neither of the compare operands can be undef in the branch targets, + // unless we have conditions that are always true/false (e.g. icmp ule + // i32, %a, i32_max). For the latter overdefined/empty range will be + // inferred, but the branch will get folded accordingly anyways. + addAdditionalUser(OtherOp, I); + mergeInValue( + IV, I, ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); + return; + } else if (Pred == CmpInst::ICMP_EQ && + (CondVal.isConstant() || CondVal.isNotConstant())) { + // For non-integer values or integer constant expressions, only + // propagate equal constants or not-constants. + addAdditionalUser(OtherOp, I); + mergeInValue(IV, I, CondVal); + return; + } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { + // Propagate inequalities. 
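// For example (illustrative values), for '%c = call @llvm.ssa.copy(%x)'
// reached under the condition '%x != 5', with OtherOp known to be the
// constant 5, the lattice value recorded for the copy becomes 'not constant 5'.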
+ addAdditionalUser(OtherOp, I); + mergeInValue(IV, I, ValueLatticeElement::getNot(CondVal.getConstant())); + return; + } + + return (void)mergeInValue(IV, I, CopyOfVal); +} + void SCCPInstVisitor::handleCallResult(CallBase &CB) { Function *F = CB.getCalledFunction(); @@ -1936,77 +2006,10 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return; Value *CopyOf = CB.getOperand(0); - ValueLatticeElement CopyOfVal = getValueState(CopyOf); - const auto *PI = getPredicateInfoFor(&CB); + const PredicateBase *PI = getPredicateInfoFor(&CB); assert(PI && "Missing predicate info for ssa.copy"); - - const std::optional<PredicateConstraint> &Constraint = - PI->getConstraint(); - if (!Constraint) { - mergeInValue(ValueState[&CB], &CB, CopyOfVal); - return; - } - - CmpInst::Predicate Pred = Constraint->Predicate; - Value *OtherOp = Constraint->OtherOp; - - // Wait until OtherOp is resolved. - if (getValueState(OtherOp).isUnknown()) { - addAdditionalUser(OtherOp, &CB); - return; - } - - ValueLatticeElement CondVal = getValueState(OtherOp); - ValueLatticeElement &IV = ValueState[&CB]; - if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) { - auto ImposedCR = - ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType())); - - // Get the range imposed by the condition. - if (CondVal.isConstantRange()) - ImposedCR = ConstantRange::makeAllowedICmpRegion( - Pred, CondVal.getConstantRange()); - - // Combine range info for the original value with the new range from the - // condition. - auto CopyOfCR = CopyOfVal.asConstantRange(CopyOf->getType(), - /*UndefAllowed=*/true); - // Treat an unresolved input like a full range. - if (CopyOfCR.isEmptySet()) - CopyOfCR = ConstantRange::getFull(CopyOfCR.getBitWidth()); - auto NewCR = ImposedCR.intersectWith(CopyOfCR); - // If the existing information is != x, do not use the information from - // a chained predicate, as the != x information is more likely to be - // helpful in practice. - if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement()) - NewCR = CopyOfCR; - - // The new range is based on a branch condition. That guarantees that - // neither of the compare operands can be undef in the branch targets, - // unless we have conditions that are always true/false (e.g. icmp ule - // i32, %a, i32_max). For the latter overdefined/empty range will be - // inferred, but the branch will get folded accordingly anyways. - addAdditionalUser(OtherOp, &CB); - mergeInValue( - IV, &CB, - ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef*/ false)); - return; - } else if (Pred == CmpInst::ICMP_EQ && - (CondVal.isConstant() || CondVal.isNotConstant())) { - // For non-integer values or integer constant expressions, only - // propagate equal constants or not-constants. - addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, CondVal); - return; - } else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant()) { - // Propagate inequalities. 
- addAdditionalUser(OtherOp, &CB); - mergeInValue(IV, &CB, - ValueLatticeElement::getNot(CondVal.getConstant())); - return; - } - - return (void)mergeInValue(IV, &CB, CopyOfVal); + handlePredicate(&CB, CopyOf, PI); + return; } if (II->getIntrinsicID() == Intrinsic::vscale) { diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index ed08c0b..571fa11 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" @@ -42,6 +43,7 @@ cl::opt<unsigned> llvm::SCEVCheapExpansionBudget( "controls the budget that is considered cheap (default = 4)")); using namespace PatternMatch; +using namespace SCEVPatternMatch; PoisonFlags::PoisonFlags(const Instruction *I) { NUW = false; @@ -1224,6 +1226,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { } Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { + Type *STy = S->getType(); const Loop *L = S->getLoop(); BasicBlock *EB = L->getExitBlock(); if (!EB || !EB->getSinglePredecessor() || @@ -1231,11 +1234,36 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { return nullptr; for (auto &PN : EB->phis()) { - if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType()) + if (!SE.isSCEVable(PN.getType())) continue; - auto *ExitV = SE.getSCEV(&PN); - if (S == ExitV) - return &PN; + auto *ExitSCEV = SE.getSCEV(&PN); + if (!isa<SCEVAddRecExpr>(ExitSCEV)) + continue; + Type *PhiTy = PN.getType(); + if (STy->isIntegerTy() && PhiTy->isPointerTy()) + ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy); + else if (S->getType() != PN.getType()) + continue; + + // Check if we can re-use the existing PN, by adjusting it with an expanded + // offset, if the offset is simpler. 
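// Sketch with made-up SCEVs: if S is {%base+4,+,1} and the exit phi's SCEV is
// {%base,+,1}, Diff below is the constant 4, and the phi can be reused by
// emitting 'add %lcssa.phi, 4' instead of re-expanding the whole recurrence.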
+ const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV); + const SCEV *Op = Diff; + match(Diff, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op))); + match(Op, m_scev_PtrToInt(m_SCEV(Op))); + if (!isa<SCEVConstant, SCEVUnknown>(Op)) + continue; + + assert(Diff->getType()->isIntegerTy() && + "difference must be of integer type"); + Value *DiffV = expand(Diff); + Value *BaseV = fixupLCSSAFormFor(&PN); + if (PhiTy->isPointerTy()) { + if (STy->isPointerTy()) + return Builder.CreatePtrAdd(BaseV, DiffV); + BaseV = Builder.CreatePtrToInt(BaseV, DiffV->getType()); + } + return Builder.CreateAdd(BaseV, DiffV); } return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index f57ce0c..ea0fa06 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -170,8 +170,7 @@ public: new VPInstruction(Opcode, Operands, Flags, DL, Name)); } - VPInstruction *createNaryOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, + VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Type *ResultTy, const VPIRFlags &Flags = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { @@ -180,7 +179,7 @@ public: } VPInstruction *createOverflowingOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, + ArrayRef<VPValue *> Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99a96a8..850c4a1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -93,6 +93,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -155,6 +156,7 @@ #include <utility> using namespace llvm; +using namespace SCEVPatternMatch; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// ElementCount to include loops whose trip count is a function of vscale. 
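/// For example (hypothetical counts): a constant trip count of 100 is still
/// returned as ElementCount::getFixed(100), while a trip count of '8 * vscale'
/// (with the nuw flag on the multiply) is returned as
/// ElementCount::getScalable(8).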
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L) { - return ElementCount::getFixed(SE->getSmallConstantTripCount(L)); + if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L)) + return ElementCount::getFixed(ExpectedTC); + + const SCEV *BTC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BTC)) + return ElementCount::getFixed(0); + + const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L); + if (isa<SCEVVScale>(ExitCount)) + return ElementCount::getScalable(1); + + const APInt *Scale; + if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale()))) + if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap()) + if (Scale->getActiveBits() <= 32) + return ElementCount::getScalable(Scale->getZExtValue()); + + return ElementCount::getFixed(0); } /// Returns "best known" trip count, which is either a valid positive trip count @@ -1363,11 +1382,15 @@ public: TTI.hasActiveVectorLength() && !EnableVPlanNativePath; if (EVLIsLegal) return; - // If for some reason EVL mode is unsupported, fallback to - // DataWithoutLaneMask to try to vectorize the loop with folded tail - // in a generic way. - ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, - TailFoldingStyle::DataWithoutLaneMask}; + // If for some reason EVL mode is unsupported, fallback to a scalar epilogue + // if it's allowed, or DataWithoutLaneMask otherwise. + if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) + ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None}; + else + ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, + TailFoldingStyle::DataWithoutLaneMask}; + LLVM_DEBUG( dbgs() << "LV: Preference for VP intrinsics indicated. Will " "not try to generate VP Intrinsics " @@ -2021,6 +2044,9 @@ public: /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR /// outside VPlan. std::pair<Value *, BasicBlock *> getMemRuntimeChecks() { + using namespace llvm::PatternMatch; + if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt())) + return {nullptr, nullptr}; return {MemRuntimeCheckCond, MemCheckBlock}; } @@ -2586,12 +2612,12 @@ static void cse(BasicBlock *BB) { } } -/// This function attempts to return a value that represents the vectorization -/// factor at runtime. For fixed-width VFs we know this precisely at compile +/// This function attempts to return a value that represents the ElementCount +/// at runtime. For fixed-width VFs we know this precisely at compile /// time, but for scalable VFs we calculate it based on an estimate of the /// vscale value. -static unsigned getEstimatedRuntimeVF(ElementCount VF, - std::optional<unsigned> VScale) { +static unsigned estimateElementCount(ElementCount VF, + std::optional<unsigned> VScale) { unsigned EstimatedVF = VF.getKnownMinValue(); if (VF.isScalable()) if (VScale) @@ -2701,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // use the value of vscale used for tuning. Loop *VectorLoop = LI->getLoopFor(HeaderBB); unsigned EstimatedVFxUF = - getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning()); + estimateElementCount(VF * UF, Cost->getVScaleForTuning()); setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); } @@ -2996,7 +3022,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // is correct. The easiest form of the later is to require that all values // stored are the same. 
return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && - Legal->isInvariant(cast<StoreInst>(I)->getValueOperand())); + TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); } case Instruction::UDiv: case Instruction::SDiv: @@ -3124,7 +3150,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( isa<LoadInst>(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); bool StoreAccessWithGapsRequiresMasking = - isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); + isa<StoreInst>(I) && !Group->isFull(); if (!PredicatedAccessRequiresMasking && !LoadAccessWithGapsRequiresEpilogMasking && !StoreAccessWithGapsRequiresMasking) @@ -4330,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); unsigned Width = - getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); + estimateElementCount(Candidate.Width, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF << " costs: " << (Candidate.Cost / Width)); if (VF.isScalable()) @@ -4438,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 ? EpilogueVectorizationMinVF : TTI.getEpilogueVectorizationMinVF(); - return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >= + return estimateElementCount(VF * Multiplier, VScaleForTuning) >= MinVFThreshold; } @@ -4491,25 +4517,23 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. ElementCount EstimatedRuntimeVF = ElementCount::getFixed( - getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning())); + estimateElementCount(MainLoopVF, CM.getVScaleForTuning())); ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; unsigned MaxTripCount = 0; - if (MainLoopVF.isFixed()) { - // TODO: extend to support scalable VFs. - const SCEV *TC = vputils::getSCEVExprForVPValue( - getPlanFor(MainLoopVF).getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TC) && - "Trip count SCEV must be computable"); - RemainingIterations = SE.getURemExpr( - TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); - - // No iterations left to process in the epilogue. - if (RemainingIterations->isZero()) - return Result; + const SCEV *TC = + vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable"); + RemainingIterations = + SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC)); + + // No iterations left to process in the epilogue. + if (RemainingIterations->isZero()) + return Result; + if (MainLoopVF.isFixed()) { MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, SE.getConstant(TCType, MaxTripCount))) { @@ -4740,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning); - // Try to get the exact trip count, or an estimate based on profiling data or // ConstantMax from PSE, failing that. 
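// Worked example for the clamping below (made-up numbers): with a known trip
// count of 17, a required scalar epilogue and an estimated VF of 4, the
// available iterations are 16, giving an interleave-count lower bound of
// bit_floor(16 / (4 * 2)) = 2, assuming the target's maximum allows it.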
- if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) { + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); + + // For fixed length VFs treat a scalable trip count as unknown. + if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) { + // Re-evaluate trip counts and VFs to be in the same numerical space. + unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning); + unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning); + // At least one iteration must be scalar when this constraint holds. So the // maximum available iterations for interleaving is one less. - unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) - ? BestKnownTC->getFixedValue() - 1 - : BestKnownTC->getFixedValue(); + if (requiresScalarEpilogue(VF.isVector())) + --AvailableTC; unsigned InterleaveCountLB = bit_floor(std::max( 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); @@ -5344,7 +5372,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // Calculate the cost of the whole interleaved group. bool UseMaskForGaps = (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || - (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); + (isa<StoreInst>(I) && !Group->isFull()); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), @@ -6920,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, // Now compute and add the VPlan-based cost. Cost += Plan.cost(VF, CostCtx); #ifndef NDEBUG - unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); + unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << " (Estimated cost per lane: "); if (Cost.isValid()) { @@ -7276,6 +7304,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); + VPlanTransforms::removeBranchOnConst(BestVPlan); VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); @@ -7286,6 +7315,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( // Regions are dissolved after optimizing for VF and UF, which completely // removes unneeded loop regions first. VPlanTransforms::dissolveLoopRegions(BestVPlan); + // Canonicalize EVL loops after regions are dissolved. + VPlanTransforms::canonicalizeEVLLoops(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, OrigLoop->getParentLoop(), @@ -9605,7 +9636,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); + unsigned IntVF = estimateElementCount(VF.Width, VScale); uint64_t RtC = TotalCost.getValue(); uint64_t Div = ScalarC * IntVF - VF.Cost.getValue(); uint64_t MinTC1 = Div == 0 ? 
0 : divideCeil(RtC * IntVF, Div); @@ -10072,12 +10103,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - if (LVL.hasUncountableEarlyExit() && UserIC != 1) { - UserIC = 1; - reportVectorizationInfo("Interleaving not supported for loops " - "with uncountable early exits", - "InterleaveEarlyExitDisabled", ORE, L); - } // Plan how to best vectorize. LVP.plan(UserVF, UserIC); @@ -10095,9 +10120,20 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) + if (VF.Width.isVector() || SelectedIC > 1) { Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + // Bail out early if either the SCEV or memory runtime checks are known to + // fail. In that case, the vector loop would never execute. + using namespace llvm::PatternMatch; + if (Checks.getSCEVChecks().first && + match(Checks.getSCEVChecks().first, m_One())) + return false; + if (Checks.getMemRuntimeChecks().first && + match(Checks.getMemRuntimeChecks().first, m_One())) + return false; + } + // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; @@ -10228,6 +10264,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. + VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount, + BestPlan, VF.Width, IC, PSE); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { @@ -10295,6 +10336,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.MinProfitableTripCount, IC, &CM, BFI, PSI, Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. 
+ VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount, + BestPlan, VF.Width, IC, PSE); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0adad5a..593868f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6861,7 +6861,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, return std::move(ResOrder); } if (TE.State == TreeEntry::StridedVectorize && !TopToBottom && - (!TE.UserTreeIndex || + (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() || !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) && (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) return std::nullopt; @@ -7175,7 +7175,8 @@ bool BoUpSLP::isProfitableToReorder() const { // other nodes are phis or geps/binops, combined with phis, and/or single // gather load node bool HasPhis = false; - if (VectorizableTree.front()->getOpcode() == Instruction::PHI && + if (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::PHI && VectorizableTree.front()->Scalars.size() == TinyVF && VectorizableTree.front()->getNumOperands() > PhiOpsLimit) return false; @@ -7999,7 +8000,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const { - if ((Entry.getOpcode() == Instruction::Store || + if (Entry.hasState() && + (Entry.getOpcode() == Instruction::Store || Entry.getOpcode() == Instruction::Load) && Entry.State == TreeEntry::StridedVectorize && !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices)) @@ -10231,6 +10233,15 @@ public: count_if(VL, [&](Value *V) { return S.isCopyableElement(V); }); if (CopyableNum < VL.size() / 2) return S; + // Too many phi copyables - exit. + const unsigned Limit = VL.size() / 24; + if ((CopyableNum >= VL.size() - Limit || + (CopyableNum >= VL.size() - 1 && VL.size() > 4) || + CopyableNum >= MaxPHINumOperands) && + all_of(VL, [&](Value *V) { + return isa<PHINode>(V) || !S.isCopyableElement(V); + })) + return InstructionsState::invalid(); // Check profitability if number of copyables > VL.size() / 2. // 1. Reorder operands for better matching. if (isCommutative(MainOp)) { @@ -14483,7 +14494,8 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // If the tree contains only phis, buildvectors, split nodes and // small nodes with reuses, we can skip it. 
- unsigned SingleStoreLoadNode = 0; + SmallVector<const TreeEntry *> StoreLoadNodes; + unsigned NumGathers = 0; constexpr int LimitTreeSize = 36; if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && all_of(VectorizableTree, @@ -14491,9 +14503,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { if (!TE->isGather() && TE->hasState() && (TE->getOpcode() == Instruction::Load || TE->getOpcode() == Instruction::Store)) { - ++SingleStoreLoadNode; + StoreLoadNodes.push_back(TE.get()); return true; } + if (TE->isGather()) + ++NumGathers; return TE->State == TreeEntry::SplitVectorize || (TE->Idx == 0 && TE->Scalars.size() == 2 && TE->hasState() && TE->getOpcode() == Instruction::ICmp && @@ -14510,8 +14524,15 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { !TE->ReorderIndices.empty() || TE->isAltShuffle()) && TE->Scalars.size() == 2))); }) && - (!SingleStoreLoadNode || - VectorizableTree.size() > LimitTreeSize * SingleStoreLoadNode)) + (StoreLoadNodes.empty() || + (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() && + (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) { + return TE->getOpcode() == Instruction::Store || + all_of(TE->Scalars, [&](Value *V) { + return !isa<LoadInst>(V) || + areAllUsersVectorized(cast<Instruction>(V)); + }); + }))))) return true; // We can vectorize the tree if its size is greater than or equal to the @@ -15254,6 +15275,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals, bool IsProfitablePHIUser = (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic && VectorizableTree.front()->Scalars.size() > 2)) && + VectorizableTree.front()->hasState() && VectorizableTree.front()->getOpcode() == Instruction::PHI && !Inst->hasNUsesOrMore(UsesLimit) && none_of(Inst->users(), @@ -15704,7 +15726,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming // blocks. - if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp()); + if (auto *PHI = dyn_cast_or_null<PHINode>( + TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr); PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) { TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); TEInsertPt = TEInsertBlock->getTerminator(); @@ -15803,7 +15826,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( "Expected only single user of a gather node."); const EdgeInfo &UseEI = TEPtr->UserTreeIndex; - PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize + PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize && + UseEI.UserTE->hasState()) ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp()) : nullptr; Instruction *InsertPt = @@ -15816,7 +15840,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( TEUseEI.UserTE->isAltShuffle()) && all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) { if (UseEI.UserTE->State != TreeEntry::Vectorize || - (UseEI.UserTE->getOpcode() == Instruction::PHI && + (UseEI.UserTE->hasState() && + UseEI.UserTE->getOpcode() == Instruction::PHI && !UseEI.UserTE->isAltShuffle()) || !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock)) continue; @@ -16438,24 +16463,31 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with // constant indices or gathered loads or copyables). 
- auto *Front = E->getMainOp(); + Instruction *Front; + unsigned Opcode; + if (E->hasState()) { + Front = E->getMainOp(); + Opcode = E->getOpcode(); + } else { + Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>)); + Opcode = Front->getOpcode(); + } auto *BB = Front->getParent(); - assert(((GatheredLoadsEntriesFirst.has_value() && - E->getOpcode() == Instruction::Load && E->isGather() && - E->Idx < *GatheredLoadsEntriesFirst) || - E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() || - all_of(E->Scalars, - [=](Value *V) -> bool { - if (E->getOpcode() == Instruction::GetElementPtr && - !isa<GetElementPtrInst>(V)) - return true; - auto *I = dyn_cast<Instruction>(V); - return !I || !E->getMatchingMainOpOrAltOp(I) || - I->getParent() == BB || - isVectorLikeInstWithConstOps(I); - })) && - "Expected gathered loads or GEPs or instructions from same basic " - "block."); + assert( + ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load && + E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) || + E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() || + all_of(E->Scalars, + [=](Value *V) -> bool { + if (Opcode == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(V)) + return true; + auto *I = dyn_cast<Instruction>(V); + return !I || !E->getMatchingMainOpOrAltOp(I) || + I->getParent() == BB || isVectorLikeInstWithConstOps(I); + })) && + "Expected gathered loads or GEPs or instructions from same basic " + "block."); auto FindLastInst = [&]() { Instruction *LastInst = Front; @@ -16470,13 +16502,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { LastInst = I; continue; } - assert(((E->getOpcode() == Instruction::GetElementPtr && + assert(((Opcode == Instruction::GetElementPtr && !isa<GetElementPtrInst>(I)) || E->State == TreeEntry::SplitVectorize || (isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I)) || (GatheredLoadsEntriesFirst.has_value() && - E->getOpcode() == Instruction::Load && E->isGather() && + Opcode == Instruction::Load && E->isGather() && E->Idx < *GatheredLoadsEntriesFirst)) && "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(LastInst->getParent())) { @@ -16512,11 +16544,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { FirstInst = I; continue; } - assert(((E->getOpcode() == Instruction::GetElementPtr && - !isa<GetElementPtrInst>(I)) || - (isVectorLikeInstWithConstOps(FirstInst) && - isVectorLikeInstWithConstOps(I))) && - "Expected vector-like or non-GEP in GEP node insts only."); + assert(((Opcode == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(I)) || + (isVectorLikeInstWithConstOps(FirstInst) && + isVectorLikeInstWithConstOps(I))) && + "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(FirstInst->getParent())) { FirstInst = I; continue; @@ -16554,7 +16586,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // Set insertpoint for gathered loads to the very first load. 
if (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && - E->getOpcode() == Instruction::Load) { + Opcode == Instruction::Load) { Res = FindFirstInst(); EntryToLastInstruction.try_emplace(E, Res); return *Res; @@ -16586,7 +16618,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { }; const ScheduleBundle *Bundle = FindScheduleBundle(E); if (!E->isGather() && !Bundle) { - if ((E->getOpcode() == Instruction::GetElementPtr && + if ((Opcode == Instruction::GetElementPtr && any_of(E->Scalars, [](Value *V) { return !isa<GetElementPtrInst>(V) && isa<Instruction>(V); @@ -21001,9 +21033,10 @@ void BoUpSLP::computeMinimumValueSizes() { if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, SelectInst>(U) || isa<SIToFPInst, UIToFPInst>(U) || - !isa<CastInst, BinaryOperator, FreezeInst, PHINode, - SelectInst>(UserTE->getMainOp()) || - isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp())) + (UserTE->hasState() && + (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, + SelectInst>(UserTE->getMainOp()) || + isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp())))) return true; unsigned UserTESz = DL->getTypeSizeInBits( UserTE->Scalars.front()->getType()); @@ -21253,6 +21286,7 @@ void BoUpSLP::computeMinimumValueSizes() { NodeIdx < VectorizableTree.size() && VectorizableTree[NodeIdx]->UserTreeIndex && VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 && + VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() && VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() == Instruction::Trunc && !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 40a5565..25b9616 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -962,7 +962,11 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, BackedgeTakenCount->setUnderlyingValue(TCMO); } - VectorTripCount.setUnderlyingValue(VectorTripCountV); + if (!VectorTripCount.getUnderlyingValue()) + VectorTripCount.setUnderlyingValue(VectorTripCountV); + else + assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV && + "VectorTripCount set earlier must match VectorTripCountV"); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 99fd97e..a5de593 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -906,10 +906,10 @@ template <unsigned PartOpIdx> class LLVM_ABI_FOR_TEST VPUnrollPartAccessor { protected: /// Return the VPValue operand containing the unroll part or null if there is /// no such operand. - VPValue *getUnrollPartOperand(VPUser &U) const; + VPValue *getUnrollPartOperand(const VPUser &U) const; /// Return the unroll part. - unsigned getUnrollPart(VPUser &U) const; + unsigned getUnrollPart(const VPUser &U) const; }; /// Helper to manage IR metadata for recipes. It filters out metadata that @@ -1012,6 +1012,10 @@ public: ReductionStartVector, // Creates a step vector starting from 0 to VF with a step of 1. StepVector, + /// Extracts a single lane (first operand) from a set of vector operands. + /// The lane specifies an index into a vector formed by combining all vector + /// operands (all operands after the first one).
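// Illustrative sketch (hypothetical scalar model, not LLVM API): the
// ExtractLane semantics documented above address the logical concatenation of
// the per-part vector operands; lane 5 of two 4-lane parts is element 1 of the
// second part, matching the compare/select chain emitted for ExtractLane in
// VPInstruction::generate().
#include <cassert>
#include <cstdint>
#include <vector>

static int64_t extractLaneModel(uint64_t Lane,
                                const std::vector<std::vector<int64_t>> &Parts) {
  for (const std::vector<int64_t> &Part : Parts) {
    if (Lane < Part.size())
      return Part[Lane]; // lane falls into this unrolled part
    Lane -= Part.size(); // otherwise skip past it
  }
  assert(false && "lane out of range");
  return 0;
}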
+ ExtractLane, }; @@ -1662,6 +1666,8 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, VPSlotTracker &SlotTracker) const override; #endif + unsigned getOpcode() const { return Instruction::Select; } + VPValue *getCond() const { return getOperand(0); } @@ -1835,6 +1841,10 @@ public: getGEPNoWrapFlags(), getDebugLoc()); } + /// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that + /// this is only accurate after the VPlan has been unrolled. + bool isFirstPart() const { return getUnrollPart(*this) == 0; } + /// Return the cost of this VPHeaderPHIRecipe. InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override { @@ -2302,14 +2312,15 @@ public: /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can /// be omitted (implied by passing an odd number of operands) in which case /// all other incoming values are merged into it. - VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) { + VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands, DebugLoc DL) + : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, DL) { assert(Operands.size() > 0 && "Expected at least one operand!"); } VPBlendRecipe *clone() override { SmallVector<VPValue *> Ops(operands()); - return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops); + return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops, + getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPBlendSC) @@ -3484,7 +3495,7 @@ public: /// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that /// this is only accurate after the VPlan has been unrolled. - bool isPart0() { return getUnrollPart(*this) == 0; } + bool isPart0() const { return getUnrollPart(*this) == 0; } VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC) @@ -4054,6 +4065,10 @@ public: /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } + LLVMContext &getContext() const { + return getScalarHeader()->getIRBasicBlock()->getContext(); + } + void addVF(ElementCount VF) { VFs.insert(VF); } void setVF(ElementCount VF) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 3499e65..16072f2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -110,6 +110,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: return SetResultTyFromOp(); + case VPInstruction::ExtractLane: + return inferScalarType(R->getOperand(1)); case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 194874a..6c1f53b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -437,9 +437,12 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, // We are about to replace the branch to exit the region. Remove the original // BranchOnCond, if there is any. 
+ DebugLoc LatchDL = DL; if (!LatchVPBB->empty() && - match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) + match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) { + LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); LatchVPBB->getTerminator()->eraseFromParent(); + } VPBuilder Builder(LatchVPBB); // Add a VPInstruction to increment the scalar canonical IV by VF * UF. @@ -452,7 +455,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, // Add the BranchOnCount VPInstruction to the latch. Builder.createNaryOp(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, + LatchDL); } void VPlanTransforms::prepareForVectorization( @@ -462,28 +466,27 @@ void VPlanTransforms::prepareForVectorization( VPDominatorTree VPDT; VPDT.recalculate(Plan); - VPBlockBase *HeaderVPB = Plan.getEntry()->getSingleSuccessor(); - canonicalHeaderAndLatch(HeaderVPB, VPDT); - VPBlockBase *LatchVPB = HeaderVPB->getPredecessors()[1]; + auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor()); + canonicalHeaderAndLatch(HeaderVPBB, VPDT); + auto *LatchVPBB = cast<VPBasicBlock>(HeaderVPBB->getPredecessors()[1]); VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph"); VPBlockUtils::insertBlockAfter(VecPreheader, Plan.getEntry()); VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock("middle.block"); - // The canonical LatchVPB has the header block as last successor. If it has + // The canonical LatchVPBB has the header block as last successor. If it has // another successor, this successor is an exit block - insert middle block on // its edge. Otherwise, add middle block as another successor retaining header // as last. - if (LatchVPB->getNumSuccessors() == 2) { - VPBlockBase *LatchExitVPB = LatchVPB->getSuccessors()[0]; - VPBlockUtils::insertOnEdge(LatchVPB, LatchExitVPB, MiddleVPBB); + if (LatchVPBB->getNumSuccessors() == 2) { + VPBlockBase *LatchExitVPB = LatchVPBB->getSuccessors()[0]; + VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, MiddleVPBB); } else { - VPBlockUtils::connectBlocks(LatchVPB, MiddleVPBB); - LatchVPB->swapSuccessors(); + VPBlockUtils::connectBlocks(LatchVPBB, MiddleVPBB); + LatchVPBB->swapSuccessors(); } - addCanonicalIVRecipes(Plan, cast<VPBasicBlock>(HeaderVPB), - cast<VPBasicBlock>(LatchVPB), InductionTy, IVDL); + addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, InductionTy, IVDL); [[maybe_unused]] bool HandledUncountableEarlyExit = false; // Disconnect all early exits from the loop leaving it with a single exit from @@ -499,8 +502,7 @@ void VPlanTransforms::prepareForVectorization( assert(!HandledUncountableEarlyExit && "can handle exactly one uncountable early exit"); handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan, - cast<VPBasicBlock>(HeaderVPB), - cast<VPBasicBlock>(LatchVPB), Range); + HeaderVPBB, LatchVPBB, Range); HandledUncountableEarlyExit = true; } else { for (VPRecipeBase &R : EB->phis()) @@ -564,15 +566,15 @@ void VPlanTransforms::prepareForVectorization( // the corresponding compare because they may have ended up with different // line numbers and we want to avoid awkward line stepping while debugging. // E.g., if the compare has got a line number inside the loop. 
- DebugLoc LatchDL = TheLoop->getLoopLatch()->getTerminator()->getDebugLoc(); + DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); VPBuilder Builder(MiddleVPBB); VPValue *Cmp; if (!RequiresScalarEpilogueCheck) - Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse( - IntegerType::getInt1Ty(TripCount->getType()->getContext()))); + Cmp = Plan.getOrAddLiveIn( + ConstantInt::getFalse(IntegerType::getInt1Ty(Plan.getContext()))); else if (TailFolded) - Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue( - IntegerType::getInt1Ty(TripCount->getType()->getContext()))); + Cmp = Plan.getOrAddLiveIn( + ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); else Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(), &Plan.getVectorTripCount(), LatchDL, "cmp.n"); @@ -646,7 +648,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, .createNaryOp(VPInstruction::BranchOnCond, {CondVPV}, Plan.getCanonicalIV()->getDebugLoc()); if (AddBranchWeights) { - MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext()); + MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false); Term->addMetadata(LLVMContext::MD_prof, BranchWeights); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index f0cab79..3b3bbc3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -184,8 +184,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { VPValue *Cond = SI->getOperand(0); VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]); MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares; - for (const auto &[Idx, Succ] : - enumerate(ArrayRef(Src->getSuccessors()).drop_front())) { + for (const auto &[Idx, Succ] : enumerate(drop_begin(Src->getSuccessors()))) { VPBasicBlock *Dst = cast<VPBasicBlock>(Succ); assert(!getEdgeMask(Src, Dst) && "Edge masks already created"); // Cases whose destination is the same as default are redundant and can @@ -206,7 +205,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { // cases with destination == Dst are taken. Join the conditions for each // case whose destination == Dst using an OR. 
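// Illustrative sketch (hypothetical scalar model, not the VPlan API): the edge
// mask for a switch destination is the OR of the compares of all cases that
// branch to it, combined with the mask of the source block, as the code right
// below does with createOr/createLogicalAnd.
#include <vector>

static bool switchEdgeMaskModel(bool SrcMask,
                                const std::vector<bool> &CaseCondsForDst) {
  bool Mask = false;
  for (bool Cond : CaseCondsForDst)
    Mask = Mask || Cond;  // join the per-case compares with OR
  return SrcMask && Mask; // then AND with the source block's mask
}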
VPValue *Mask = Conds[0]; - for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front()) + for (VPValue *V : drop_begin(Conds)) Mask = Builder.createOr(Mask, V); if (SrcMask) Mask = Builder.createLogicalAnd(SrcMask, Mask); @@ -252,8 +251,9 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { } OperandsWithMask.push_back(EdgeMask); } - PHINode *IRPhi = cast<PHINode>(PhiR->getUnderlyingValue()); - auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask); + PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue()); + auto *Blend = + new VPBlendRecipe(IRPhi, OperandsWithMask, PhiR->getDebugLoc()); Builder.insert(Blend); PhiR->replaceAllUsesWith(Blend); PhiR->eraseFromParent(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b2066ce..68e7c20 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -413,20 +413,21 @@ void VPSingleDefRecipe::dump() const { VPDef::dump(); } template <unsigned PartOpIdx> VPValue * -VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { +VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const { if (U.getNumOperands() == PartOpIdx + 1) return U.getOperand(PartOpIdx); return nullptr; } template <unsigned PartOpIdx> -unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { +unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const { if (auto *UnrollPartOp = getUnrollPartOperand(U)) return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); return 0; } namespace llvm { +template class VPUnrollPartAccessor<1>; template class VPUnrollPartAccessor<2>; template class VPUnrollPartAccessor<3>; } @@ -863,6 +864,31 @@ Value *VPInstruction::generate(VPTransformState &State) { Res = Builder.CreateOr(Res, State.get(Op)); return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } + case VPInstruction::ExtractLane: { + Value *LaneToExtract = State.get(getOperand(0), true); + Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0)); + Value *Res = nullptr; + Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF); + + for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { + Value *VectorStart = + Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1)); + Value *VectorIdx = Idx == 1 + ? LaneToExtract + : Builder.CreateSub(LaneToExtract, VectorStart); + Value *Ext = State.VF.isScalar() + ? State.get(getOperand(Idx)) + : Builder.CreateExtractElement( + State.get(getOperand(Idx)), VectorIdx); + if (Res) { + Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart); + Res = Builder.CreateSelect(Cmp, Ext, Res); + } else { + Res = Ext; + } + } + return Res; + } case VPInstruction::FirstActiveLane: { if (getNumOperands() == 1) { Value *Mask = State.get(getOperand(0)); @@ -921,7 +947,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { - case Instruction::ExtractElement: { + case Instruction::ExtractElement: + case VPInstruction::ExtractLane: { // Add on the cost of extracting the element. 
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -983,6 +1010,7 @@ bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractLastElement || getOpcode() == VPInstruction::ExtractPenultimateElement || getOpcode() == Instruction::ExtractElement || + getOpcode() == VPInstruction::ExtractLane || getOpcode() == VPInstruction::FirstActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || @@ -1048,6 +1076,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::ExtractLane: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: @@ -1097,6 +1126,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::ComputeAnyOfResult: case VPInstruction::ComputeFindIVResult: return Op == getOperand(1); + case VPInstruction::ExtractLane: + return Op == getOperand(0); }; llvm_unreachable("switch should return"); } @@ -1176,6 +1207,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BuildVector: O << "buildvector"; break; + case VPInstruction::ExtractLane: + O << "extract-lane"; + break; case VPInstruction::ExtractLastElement: O << "extract-last-element"; break; @@ -3357,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // must use intrinsics to interleave. if (VecTy->isScalableTy()) { assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors"); - VectorType *InterleaveTy = - VectorType::get(VecTy->getElementType(), - VecTy->getElementCount().multiplyCoefficientBy(Factor)); - return Builder.CreateIntrinsic(InterleaveTy, - getInterleaveIntrinsicID(Factor), Vals, - /*FMFSource=*/nullptr, Name); + return Builder.CreateVectorInterleave(Vals, Name); } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -3469,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { assert(InterleaveFactor <= 8 && "Unsupported deinterleave factor for scalable vectors"); NewLoad = State.Builder.CreateIntrinsic( - getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), - NewLoad, + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, /*FMFSource=*/nullptr, "strided.vec"); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 5da43b6..a1d12a3a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -32,11 +32,11 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/TypeSize.h" using namespace llvm; +using namespace VPlanPatternMatch; bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlanPtr &Plan, @@ -528,13 +528,11 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) { /// Returns true if \p R is dead and can be removed. static bool isDeadRecipe(VPRecipeBase &R) { - using namespace llvm::PatternMatch; // Do remove conditional assume instructions as their conditions may be // flattened. 
auto *RepR = dyn_cast<VPReplicateRecipe>(&R); - bool IsConditionalAssume = - RepR && RepR->isPredicated() && - match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>()); + bool IsConditionalAssume = RepR && RepR->isPredicated() && + match(RepR, m_Intrinsic<Intrinsic::assume>()); if (IsConditionalAssume) return true; @@ -625,7 +623,6 @@ static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { /// original IV's users. This is an optional optimization to reduce the needs of /// vector extracts. static void legalizeAndOptimizeInductions(VPlan &Plan) { - using namespace llvm::VPlanPatternMatch; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); @@ -727,7 +724,6 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) { return nullptr; auto IsWideIVInc = [&]() { - using namespace VPlanPatternMatch; auto &ID = WideIV->getInductionDescriptor(); // Check if VPV increments the induction by the induction step. @@ -771,13 +767,11 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op) { - using namespace VPlanPatternMatch; - VPValue *Incoming, *Mask; - if (!match(Op, m_VPInstruction<Instruction::ExtractElement>( - m_VPValue(Incoming), + if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>( m_VPInstruction<VPInstruction::FirstActiveLane>( - m_VPValue(Mask))))) + m_VPValue(Mask)), + m_VPValue(Incoming)))) return nullptr; auto *WideIV = getOptimizableIVOf(Incoming); @@ -827,8 +821,6 @@ static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap<VPValue *, VPValue *> &EndValues) { - using namespace VPlanPatternMatch; - VPValue *Incoming; if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastElement>( m_VPValue(Incoming)))) @@ -986,7 +978,6 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, /// Try to simplify recipe \p R. static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - using namespace llvm::VPlanPatternMatch; VPlan *Plan = R.getParent()->getPlan(); auto *Def = dyn_cast<VPSingleDefRecipe>(&R); @@ -997,7 +988,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // InstSimplifyFolder. if (TypeSwitch<VPRecipeBase *, bool>(&R) .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, - VPReplicateRecipe>([&](auto *I) { + VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) { const DataLayout &DL = Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL, @@ -1094,6 +1085,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0)))) + return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) + : R.getOperand(0)); + if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) return Def->replaceAllUsesWith(A); @@ -1172,6 +1167,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (!Plan->isUnrolled()) return; + // VPVectorPointer for part 0 can be replaced by their start pointer. 
+ if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) { + if (VecPtr->isFirstPart()) { + VecPtr->replaceAllUsesWith(VecPtr->getOperand(0)); + return; + } + } + // VPScalarIVSteps for part 0 can be replaced by their start value, if only // the first lane is demanded. if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) { @@ -1257,7 +1260,6 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes /// to make sure the masks are simplified. static void simplifyBlends(VPlan &Plan) { - using namespace llvm::VPlanPatternMatch; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { @@ -1307,8 +1309,9 @@ static void simplifyBlends(VPlan &Plan) { OperandsWithMask.push_back(Blend->getMask(I)); } - auto *NewBlend = new VPBlendRecipe( - cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); + auto *NewBlend = + new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()), + OperandsWithMask, Blend->getDebugLoc()); NewBlend->insertBefore(&R); VPValue *DeadMask = Blend->getMask(StartIndex); @@ -1361,7 +1364,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, unsigned NewBitWidth = ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF); - LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext(); + LLVMContext &Ctx = Plan.getContext(); auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth); bool MadeChange = false; @@ -1380,7 +1383,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, // Currently only handle cases where the single user is a header-mask // comparison with the backedge-taken-count. - using namespace VPlanPatternMatch; if (!match( *WideIV->user_begin(), m_Binary<Instruction::ICmp>( @@ -1411,7 +1413,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE) { - using namespace llvm::VPlanPatternMatch; if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue()))) return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF, &SE](VPValue *C) { @@ -1451,7 +1452,6 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, auto *Term = &ExitingVPBB->back(); VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); - using namespace llvm::VPlanPatternMatch; if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || match(Term, m_BranchOnCond( m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { @@ -1834,7 +1834,6 @@ void VPlanTransforms::truncateToMinimalBitwidths( if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R)) VPW->dropPoisonGeneratingFlags(); - using namespace llvm::VPlanPatternMatch; if (OldResSizeInBits != NewResSizeInBits && !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) { // Extend result to original width. @@ -1883,10 +1882,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( } } -/// Remove BranchOnCond recipes with true or false conditions together with -/// removing dead edges to their successors. 
-static void removeBranchOnConst(VPlan &Plan) { - using namespace llvm::VPlanPatternMatch; +void VPlanTransforms::removeBranchOnConst(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_shallow(Plan.getEntry()))) { VPValue *Cond; @@ -1908,12 +1904,9 @@ static void removeBranchOnConst(VPlan &Plan) { "There must be a single edge between VPBB and its successor"); // Values coming from VPBB into phi recipes of RemoveSucc are removed from // these recipes. - for (VPRecipeBase &R : RemovedSucc->phis()) { - auto *Phi = cast<VPPhiAccessors>(&R); - assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) && - "VPIRPhis must have a single predecessor"); - Phi->removeIncomingValueFor(VPBB); - } + for (VPRecipeBase &R : RemovedSucc->phis()) + cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB); + // Disconnect blocks and remove the terminator. RemovedSucc will be deleted // automatically on VPlan destruction if it becomes unreachable. VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc); @@ -2135,7 +2128,6 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL) { - using namespace llvm::VPlanPatternMatch; auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { assert(OrigMask && "Unmasked recipe when folding tail"); // HeaderMask will be handled using EVL. @@ -2215,7 +2207,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : *VPBB) { - using namespace VPlanPatternMatch; VPValue *V1, *V2; if (!match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>( @@ -2236,6 +2227,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { // Try to optimize header mask recipes away to their EVL variants. for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { + // TODO: Split optimizeMaskToEVL out and move into + // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in + // tryToBuildVPlanWithVPRecipes beforehand. for (VPUser *U : collectUsersRecursively(HeaderMask)) { auto *CurRecipe = cast<VPRecipeBase>(U); VPRecipeBase *EVLRecipe = @@ -2257,6 +2251,20 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { } ToErase.push_back(CurRecipe); } + + // Replace header masks with a mask equivalent to predicating by EVL: + // + // icmp ule widen-canonical-iv backedge-taken-count + // -> + // icmp ult step-vector, EVL + VPRecipeBase *EVLR = EVL.getDefiningRecipe(); + VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); + Type *EVLType = TypeInfo.inferScalarType(&EVL); + VPValue *EVLMask = Builder.createICmp( + CmpInst::ICMP_ULT, + Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); + HeaderMask->replaceAllUsesWith(EVLMask); + ToErase.push_back(HeaderMask->getDefiningRecipe()); } for (VPRecipeBase *R : reverse(ToErase)) { @@ -2365,6 +2373,65 @@ bool VPlanTransforms::tryAddExplicitVectorLength( return true; } +void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { + // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe. + // There should be only one EVL PHI in the entire plan. 
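// Illustrative sketch (hypothetical scalar model) of the header-mask
// replacement introduced a few hunks above: lane I of the mask is active
// exactly when I < EVL, i.e. the "icmp ult step-vector, EVL" form, instead of
// comparing the widened canonical IV against the backedge-taken count.
#include <cstdint>
#include <vector>

static std::vector<bool> evlHeaderMaskModel(uint64_t EVL, uint64_t VF) {
  std::vector<bool> Mask(VF);
  for (uint64_t I = 0; I < VF; ++I)
    Mask[I] = I < EVL; // step-vector lane I compared against EVL
  return Mask;
}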
+ VPEVLBasedIVPHIRecipe *EVLPhi = nullptr; + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(Plan.getEntry()))) + for (VPRecipeBase &R : VPBB->phis()) + if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) { + assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected"); + EVLPhi = PhiR; + } + + // Early return if no EVL PHI is found. + if (!EVLPhi) + return; + + VPBasicBlock *HeaderVPBB = EVLPhi->getParent(); + VPValue *EVLIncrement = EVLPhi->getBackedgeValue(); + + // Convert EVLPhi to concrete recipe. + auto *ScalarR = + VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement}, + EVLPhi->getDebugLoc(), "evl.based.iv"); + EVLPhi->replaceAllUsesWith(ScalarR); + EVLPhi->eraseFromParent(); + + // Replace CanonicalIVInc with EVL-PHI increment. + auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin()); + VPValue *Backedge = CanonicalIV->getIncomingValue(1); + assert(match(Backedge, + m_c_Binary<Instruction::Add>(m_Specific(CanonicalIV), + m_Specific(&Plan.getVFxUF()))) && + "Unexpected canonical iv"); + Backedge->replaceAllUsesWith(EVLIncrement); + + // Remove unused phi and increment. + VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe(); + CanonicalIVIncrement->eraseFromParent(); + CanonicalIV->eraseFromParent(); + + // Replace the use of VectorTripCount in the latch-exiting block. + // Before: (branch-on-count EVLIVInc, VectorTripCount) + // After: (branch-on-count EVLIVInc, TripCount) + + VPBasicBlock *LatchExiting = + HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock(); + auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator()); + // Skip single-iteration loop region + if (match(LatchExitingBr, m_BranchOnCond(m_True()))) + return; + assert(LatchExitingBr && + match(LatchExitingBr, + m_BranchOnCount(m_VPValue(EVLIncrement), + m_Specific(&Plan.getVectorTripCount()))) && + "Unexpected terminator in EVL loop"); + LatchExitingBr->setOperand(1, Plan.getTripCount()); +} + void VPlanTransforms::dropPoisonGeneratingRecipes( VPlan &Plan, const std::function<bool(BasicBlock *)> &BlockNeedsPredication) { @@ -2395,7 +2462,6 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // drop them directly. if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { VPValue *A, *B; - using namespace llvm::VPlanPatternMatch; // Dropping disjoint from an OR may yield incorrect results, as some // analysis may have converted it to an Add implicitly (e.g. SCEV used // for dependence analysis). Instead, replace it with an equivalent Add. @@ -2515,8 +2581,8 @@ void VPlanTransforms::createInterleaveGroups( DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) * IG->getIndex(IRInsertPos), /*IsSigned=*/true); - VPValue *OffsetVPV = Plan.getOrAddLiveIn( - ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset)); + VPValue *OffsetVPV = + Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset)); VPBuilder B(InsertPos); Addr = InBounds ? 
B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV) : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV); @@ -2689,22 +2755,11 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { - using namespace llvm::VPlanPatternMatch; - VPTypeAnalysis TypeInfo(&CanonicalIVTy); SmallVector<VPRecipeBase *> ToRemove; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) { - auto *ScalarR = VPBuilder(PhiR).createScalarPhi( - {PhiR->getStartValue(), PhiR->getBackedgeValue()}, - PhiR->getDebugLoc(), "evl.based.iv"); - PhiR->replaceAllUsesWith(ScalarR); - ToRemove.push_back(PhiR); - continue; - } - if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) { expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo); ToRemove.push_back(WidenIVR); @@ -2776,8 +2831,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, void VPlanTransforms::handleUncountableEarlyExit( VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) { - using namespace llvm::VPlanPatternMatch; - VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0]; if (!EarlyExitVPBB->getSinglePredecessor() && EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) { @@ -2842,7 +2895,7 @@ void VPlanTransforms::handleUncountableEarlyExit( VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr, "first.active.lane"); IncomingFromEarlyExit = EarlyExitB.createNaryOp( - Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane}, + VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit}, nullptr, "early.exit.value"); ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit); } @@ -2871,8 +2924,6 @@ void VPlanTransforms::handleUncountableEarlyExit( static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { - using namespace VPlanPatternMatch; - Type *RedTy = Ctx.Types.inferScalarType(Red); VPValue *VecOp = Red->getVecOp(); @@ -2918,8 +2969,6 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { - using namespace VPlanPatternMatch; - unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); if (Opcode != Instruction::Add) return nullptr; @@ -3093,6 +3142,29 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { } } +void VPlanTransforms::materializeVectorTripCount( + VPlan &Plan, ElementCount BestVF, unsigned BestUF, + PredicatedScalarEvolution &PSE) { + assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); + assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); + + VPValue *TC = Plan.getTripCount(); + // Skip cases for which the trip count may be non-trivial to materialize. + if (!Plan.hasScalarTail() || + Plan.getMiddleBlock()->getSingleSuccessor() == + Plan.getScalarPreheader() || + !TC->isLiveIn()) + return; + // Materialize vector trip counts for constants early if it can simply + // be computed as (Original TC / VF * UF) * VF * UF. 
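// Illustrative arithmetic only (assumed constants, hypothetical helper): the
// SCEV built below computes (TC udiv (VF * UF)) * (VF * UF), i.e. the trip
// count rounded down to a multiple of the unrolled vector step. For TC = 17,
// VF = 4, UF = 2 the vector trip count is (17 / 8) * 8 = 16, leaving one
// iteration for the scalar epilogue.
#include <cstdint>

constexpr uint64_t vectorTripCountModel(uint64_t TC, uint64_t VF, uint64_t UF) {
  return TC / (VF * UF) * (VF * UF); // unsigned division, as in getUDivExpr
}
static_assert(vectorTripCountModel(17, 4, 2) == 16,
              "17 rounds down to 16 with VF * UF == 8");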
+ ScalarEvolution &SE = *PSE.getSE(); + auto *TCScev = SE.getSCEV(TC->getLiveInIRValue()); + const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF); + auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF); + if (auto *NewC = dyn_cast<SCEVConstant>(VecTCScev)) + Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue()); +} + /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding @@ -3110,9 +3182,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV; if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR)) - return IR->getInterleaveGroup()->getFactor() == - IR->getInterleaveGroup()->getNumMembers() && - IR->getVPValue(Idx) == OpV; + return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV; return false; } @@ -3159,7 +3229,6 @@ static bool isAlreadyNarrow(VPValue *VPV) { void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { - using namespace llvm::VPlanPatternMatch; VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); if (VF.isScalable() || !VectorLoop) return; @@ -3229,9 +3298,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, if (!DefR) return false; auto *IR = dyn_cast<VPInterleaveRecipe>(DefR); - return IR && - IR->getInterleaveGroup()->getFactor() == - IR->getInterleaveGroup()->getNumMembers() && + return IR && IR->getInterleaveGroup()->isFull() && IR->getVPValue(Op.index()) == Op.value(); })) { StoreGroups.push_back(InterleaveR); @@ -3350,7 +3417,7 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( if (VF.isScalable() && VScaleForTuning.has_value()) VectorStep *= *VScaleForTuning; assert(VectorStep > 0 && "trip count should not be zero"); - MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext()); + MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false); MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ab189f6..880159f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -209,6 +209,18 @@ struct VPlanTransforms { /// Replace loop regions with explicit CFG. static void dissolveLoopRegions(VPlan &Plan); + /// Transform EVL loops to use variable-length stepping after region + /// dissolution. + /// + /// Once loop regions are replaced with explicit CFG, EVL loops can step with + /// variable vector lengths instead of fixed lengths. This transformation: + /// * Makes EVL-Phi concrete. + // * Removes CanonicalIV and increment. + /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc, + /// VectorTripCount) with variable-length stepping (branch-on-cond + /// EVLIVInc, TripCount). + static void canonicalizeEVLLoops(VPlan &Plan); + /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); @@ -224,6 +236,10 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. 
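// Illustrative sketch of the variable-length stepping described above
// (hypothetical scalar model; computeEVLModel is an assumed stand-in for the
// value produced by @llvm.experimental.get.vector.length): the EVL-based IV
// advances by the vector length chosen for each iteration and the exit
// compares against the original trip count, not the rounded vector trip count.
#include <algorithm>
#include <cstdint>

static uint64_t computeEVLModel(uint64_t Remaining, uint64_t VF) {
  return std::min(Remaining, VF); // simplified: targets may return less
}

static uint64_t countEVLIterationsModel(uint64_t TripCount, uint64_t VF) {
  uint64_t Iterations = 0;
  for (uint64_t IV = 0; IV < TripCount; ++Iterations)
    IV += computeEVLModel(TripCount - IV, VF); // EVL-based IV increment
  return Iterations; // e.g. TripCount = 10, VF = 4 -> 3 iterations (4, 4, 2)
}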
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Remove BranchOnCond recipes with true or false conditions together with + /// removing dead edges to their successors. + static void removeBranchOnConst(VPlan &Plan); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. @@ -234,6 +250,12 @@ struct VPlanTransforms { /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors. static void materializeBroadcasts(VPlan &Plan); + // Materialize vector trip counts for constants early if it can simply be + // computed as (Original TC / VF * UF) * VF * UF. + static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF, + unsigned BestUF, + PredicatedScalarEvolution &PSE); + /// Try to convert a plan with interleave groups with VF elements to a plan /// with the interleave groups replaced by wide loads and stores processing VF /// elements, if all transformed interleave groups access the full vector diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index b89cd21..871e37e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -363,6 +363,13 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } VPValue *Op0; + if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>( + m_VPValue(Op0), m_VPValue(Op1)))) { + addUniformForAllParts(cast<VPInstruction>(&R)); + for (unsigned Part = 1; Part != UF; ++Part) + R.addOperand(getValueForPart(Op1, Part)); + continue; + } if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>( m_VPValue(Op0))) || match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 38ada33..57d01cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -17,6 +17,7 @@ #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "VPlanHelpers.h" +#include "VPlanPatternMatch.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" @@ -171,7 +172,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case<VPInstructionWithType>( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) .Case<VPInstruction>([&](const VPInstruction *I) { - if (I->getOpcode() == Instruction::PHI) + if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::ICmp) return VerifyEVLUse(*I, 1); switch (I->getOpcode()) { case Instruction::Add: @@ -192,7 +194,13 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { errs() << "EVL used by unexpected VPInstruction\n"; return false; } - if (I->getNumUsers() != 1) { + // EVLIVIncrement is only used by EVLIV & BranchOnCount. + // Having more than two users is unexpected. 
+ if ((I->getNumUsers() != 1) && + (I->getNumUsers() != 2 || none_of(I->users(), [&I](VPUser *U) { + using namespace llvm::VPlanPatternMatch; + return match(U, m_BranchOnCount(m_Specific(I), m_VPValue())); + }))) { errs() << "EVL is used in VPInstruction with multiple users\n"; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 82adc34..6252f4f 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3174,6 +3174,55 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) { return true; } +/// Returns true if this ShuffleVectorInst eventually feeds into a +/// vector reduction intrinsic (e.g., vector_reduce_add) by only following +/// chains of shuffles and binary operators (in any combination/order). +/// The search does not go deeper than the given Depth. +static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) { + constexpr unsigned MaxVisited = 32; + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Instruction *, 4> WorkList; + bool FoundReduction = false; + + WorkList.push_back(SVI); + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (!UI || !Visited.insert(UI).second) + continue; + if (Visited.size() > MaxVisited) + return false; + if (auto *II = dyn_cast<IntrinsicInst>(UI)) { + // More than one reduction reached + if (FoundReduction) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + FoundReduction = true; + continue; + default: + return false; + } + } + + if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI)) + return false; + + WorkList.emplace_back(UI); + } + } + return FoundReduction; +} + /// This method looks for groups of shuffles acting on binops, of the form: /// %x = shuffle ... /// %y = shuffle ... @@ -3416,6 +3465,65 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind); }; + unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits(); + unsigned MaxVectorSize = + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); + unsigned MaxElementsInVector = MaxVectorSize / ElementSize; + // When there are multiple shufflevector operations on the same input, + // especially when the vector length is larger than the register size, + // identical shuffle patterns may occur across different groups of elements. + // To avoid overestimating the cost by counting these repeated shuffles more + // than once, we only account for unique shuffle patterns. This adjustment + // prevents inflated costs in the cost model for wide vectors split into + // several register-sized groups. + std::set<SmallVector<int, 4>> UniqueShuffles; + auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) { + // Compute the cost for performing the shuffle over the full vector. 
+ auto ShuffleCost = + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind); + unsigned NumFullVectors = Mask.size() / MaxElementsInVector; + if (NumFullVectors < 2) + return C + ShuffleCost; + SmallVector<int, 4> SubShuffle(MaxElementsInVector); + unsigned NumUniqueGroups = 0; + unsigned NumGroups = Mask.size() / MaxElementsInVector; + // For each group of MaxElementsInVector contiguous elements, + // collect their shuffle pattern and insert into the set of unique patterns. + for (unsigned I = 0; I < NumFullVectors; ++I) { + for (unsigned J = 0; J < MaxElementsInVector; ++J) + SubShuffle[J] = Mask[MaxElementsInVector * I + J]; + if (UniqueShuffles.insert(SubShuffle).second) + NumUniqueGroups += 1; + } + return C + ShuffleCost * NumUniqueGroups / NumGroups; + }; + auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return C; + SmallVector<int, 16> Mask; + SV->getShuffleMask(Mask); + return AddShuffleMaskAdjustedCost(C, Mask); + }; + // Check that input consists of ShuffleVectors applied to the same input + auto AllShufflesHaveSameOperands = + [](SmallPtrSetImpl<Instruction *> &InputShuffles) { + if (InputShuffles.size() < 2) + return false; + ShuffleVectorInst *FirstSV = + dyn_cast<ShuffleVectorInst>(*InputShuffles.begin()); + if (!FirstSV) + return false; + + Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1); + return std::all_of( + std::next(InputShuffles.begin()), InputShuffles.end(), + [&](Instruction *I) { + ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I); + return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1; + }); + }; + // Get the costs of the shuffles + binops before and after with the new // shuffle masks. InstructionCost CostBefore = @@ -3423,8 +3531,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); - CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), - InstructionCost(0), AddShuffleCost); + if (AllShufflesHaveSameOperands(InputShuffles)) { + UniqueShuffles.clear(); + CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), + InstructionCost(0), AddShuffleAdjustedCost); + } else { + CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), + InstructionCost(0), AddShuffleCost); + } // The new binops will be unused for lanes past the used shuffle lengths. // These types attempt to get the correct cost for that from the target. 
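// Illustrative arithmetic for the de-duplication above (assumed numbers and a
// hypothetical helper, not TTI output): a 16-lane mask on a target whose
// widest fixed-width register holds 4 such lanes is legalized into 4
// register-sized shuffles. If only 2 of the 4 sub-masks are distinct, the
// adjusted lambda charges ShuffleCost * 2 / 4 for this mask instead of the
// full ShuffleCost.
#include <cstddef>
#include <set>
#include <vector>

static std::size_t countUniqueSubMasks(const std::vector<int> &Mask,
                                       std::size_t MaxElementsInVector) {
  std::set<std::vector<int>> Unique;
  for (std::size_t I = 0; I + MaxElementsInVector <= Mask.size();
       I += MaxElementsInVector)
    Unique.insert(std::vector<int>(Mask.begin() + I,
                                   Mask.begin() + I + MaxElementsInVector));
  return Unique.size();
}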
@@ -3435,8 +3549,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { InstructionCost CostAfter = TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) + TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind); + UniqueShuffles.clear(); CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), - InstructionCost(0), AddShuffleMaskCost); + InstructionCost(0), AddShuffleMaskAdjustedCost); std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); CostAfter += std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), @@ -3445,7 +3560,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore << " vs CostAfter: " << CostAfter << "\n"); - if (CostBefore <= CostAfter) + if (CostBefore < CostAfter || + (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI))) return false; // The cost model has passed, create the new instructions. |
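// Illustrative sketch of the tie-break above (hypothetical helper, not the
// VectorCombine API): the fold is applied when it is strictly cheaper, and at
// equal cost only when the shuffles ultimately feed a vector reduction, the
// case the new feedsIntoVectorReduction helper detects.
static bool shouldFoldSelectShuffleModel(int CostBefore, int CostAfter,
                                         bool FeedsVectorReduction) {
  if (CostBefore < CostAfter)
    return false;                // strictly more expensive after the fold
  if (CostBefore == CostAfter)
    return FeedsVectorReduction; // break ties only in favour of reductions
  return true;                   // strictly cheaper: always fold
}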