Diffstat (limited to 'llvm/lib')
234 files changed, 5849 insertions, 3435 deletions
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index e71ba5e..759c553 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -929,12 +929,11 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, if (!AllConstantInt) break; - // TODO: Try to intersect two inrange attributes? - if (!InRange) { - InRange = GEP->getInRange(); - if (InRange) - // Adjust inrange by offset until now. - InRange = InRange->sextOrTrunc(BitWidth).subtract(Offset); + // Adjust inrange offset and intersect inrange attributes + if (auto GEPRange = GEP->getInRange()) { + auto AdjustedGEPRange = GEPRange->sextOrTrunc(BitWidth).subtract(Offset); + InRange = + InRange ? InRange->intersectWith(AdjustedGEPRange) : AdjustedGEPRange; } Ptr = cast<Constant>(GEP->getOperand(0)); @@ -2004,21 +2003,20 @@ inline bool llvm_fenv_testexcept() { return false; } -static const APFloat FTZPreserveSign(const APFloat &V) { +static APFloat FTZPreserveSign(const APFloat &V) { if (V.isDenormal()) return APFloat::getZero(V.getSemantics(), V.isNegative()); return V; } -static const APFloat FlushToPositiveZero(const APFloat &V) { +static APFloat FlushToPositiveZero(const APFloat &V) { if (V.isDenormal()) return APFloat::getZero(V.getSemantics(), false); return V; } -static const APFloat -FlushWithDenormKind(const APFloat &V, - DenormalMode::DenormalModeKind DenormKind) { +static APFloat FlushWithDenormKind(const APFloat &V, + DenormalMode::DenormalModeKind DenormKind) { assert(DenormKind != DenormalMode::DenormalModeKind::Invalid && DenormKind != DenormalMode::DenormalModeKind::Dynamic); switch (DenormKind) { diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 2da6468..1959ab6 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -1079,15 +1079,16 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) { // add new space S = &BS->Spaces.emplace_back(B.Space); - // the space is full - set flag to report overlapping binding later - if (S->FreeRanges.empty()) { + // The space is full - there are no free slots left, or the rest of the + // slots are taken by an unbounded array. Set flag to report overlapping + // binding later. + if (S->FreeRanges.empty() || S->FreeRanges.back().UpperBound < UINT32_MAX) { OverlappingBinding = true; continue; } // adjust the last free range lower bound, split it in two, or remove it BindingRange &LastFreeRange = S->FreeRanges.back(); - assert(LastFreeRange.UpperBound == UINT32_MAX); if (LastFreeRange.LowerBound == B.LowerBound) { if (B.UpperBound < UINT32_MAX) LastFreeRange.LowerBound = B.UpperBound + 1; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index dd9a44b..f1473b2 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3383,6 +3383,10 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst, SrcSubscripts, DstSubscripts)) return false; + assert(isLoopInvariant(SrcBase, SrcLoop) && + isLoopInvariant(DstBase, DstLoop) && + "Expected SrcBase and DstBase to be loop invariant"); + int Size = SrcSubscripts.size(); LLVM_DEBUG({ dbgs() << "\nSrcSubscripts: "; @@ -3666,6 +3670,19 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, SCEVUnionPredicate(Assume, *SE)); } + // Even if the base pointers are the same, they may not be loop-invariant. 
It + // could lead to incorrect results, as we're analyzing loop-carried + // dependencies. Src and Dst can be in different loops, so we need to check + // the base pointer is invariant in both loops. + Loop *SrcLoop = LI->getLoopFor(Src->getParent()); + Loop *DstLoop = LI->getLoopFor(Dst->getParent()); + if (!isLoopInvariant(SrcBase, SrcLoop) || + !isLoopInvariant(DstBase, DstLoop)) { + LLVM_DEBUG(dbgs() << "The base pointer is not loop invariant.\n"); + return std::make_unique<Dependence>(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); + } + uint64_t EltSize = SrcLoc.Size.toRaw(); const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase); const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase); diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 3aa9909..2b0f212 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -983,33 +983,37 @@ MemDepResult MemoryDependenceResults::getNonLocalInfoForBlock( static void SortNonLocalDepInfoCache(MemoryDependenceResults::NonLocalDepInfo &Cache, unsigned NumSortedEntries) { - switch (Cache.size() - NumSortedEntries) { - case 0: - // done, no new entries. - break; - case 2: { - // Two new entries, insert the last one into place. - NonLocalDepEntry Val = Cache.back(); - Cache.pop_back(); - MemoryDependenceResults::NonLocalDepInfo::iterator Entry = - std::upper_bound(Cache.begin(), Cache.end() - 1, Val); - Cache.insert(Entry, Val); - [[fallthrough]]; + + // If only one entry, don't sort. + if (Cache.size() < 2) + return; + + unsigned s = Cache.size() - NumSortedEntries; + + // If the cache is already sorted, don't sort it again. + if (s == 0) + return; + + // If no entry is sorted, sort the whole cache. + if (NumSortedEntries == 0) { + llvm::sort(Cache); + return; } - case 1: - // One new entry, Just insert the new value at the appropriate position. - if (Cache.size() != 1) { + + // If the number of unsorted entires is small and the cache size is big, using + // insertion sort is faster. Here use Log2_32 to quickly choose the sort + // method. + if (s < Log2_32(Cache.size())) { + while (s > 0) { NonLocalDepEntry Val = Cache.back(); Cache.pop_back(); MemoryDependenceResults::NonLocalDepInfo::iterator Entry = - llvm::upper_bound(Cache, Val); + std::upper_bound(Cache.begin(), Cache.end() - s + 1, Val); Cache.insert(Entry, Val); + s--; } - break; - default: - // Added many values, do a full scale sort. + } else { llvm::sort(Cache); - break; } } diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index e8d4e37..f1c3155 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -121,8 +121,18 @@ void ProfileSummaryInfo::computeThresholds() { ProfileSummaryBuilder::getHotCountThreshold(DetailedSummary); ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(DetailedSummary); - assert(ColdCountThreshold <= HotCountThreshold && - "Cold count threshold cannot exceed hot count threshold!"); + // When the hot and cold thresholds are identical, we would classify + // a count value as both hot and cold since we are doing an inclusive check + // (see ::is{Hot|Cold}Count(). To avoid this undesirable overlap, ensure the + // thresholds are distinct. 
+ if (HotCountThreshold == ColdCountThreshold) { + if (ColdCountThreshold > 0) + (*ColdCountThreshold)--; + else + (*HotCountThreshold)++; + } + assert(ColdCountThreshold < HotCountThreshold && + "Cold count threshold should be less than hot count threshold!"); if (!hasPartialSampleProfile() || !ScalePartialSampleProfileWorkingSetSize) { HasHugeWorkingSetSize = HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold; diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index e475be2..6e92766 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -875,6 +875,34 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_toascii); } + if (T.isOSFreeBSD()) { + TLI.setAvailable(LibFunc_dunder_strtok_r); + TLI.setAvailable(LibFunc_memalign); + TLI.setAvailable(LibFunc_fputc_unlocked); + TLI.setAvailable(LibFunc_fputs_unlocked); + TLI.setAvailable(LibFunc_fread_unlocked); + TLI.setAvailable(LibFunc_fwrite_unlocked); + TLI.setAvailable(LibFunc_getc_unlocked); + TLI.setAvailable(LibFunc_getchar_unlocked); + TLI.setAvailable(LibFunc_putc_unlocked); + TLI.setAvailable(LibFunc_putchar_unlocked); + + TLI.setUnavailable(LibFunc___kmpc_alloc_shared); + TLI.setUnavailable(LibFunc___kmpc_free_shared); + TLI.setUnavailable(LibFunc_dunder_strndup); + TLI.setUnavailable(LibFunc_memccpy_chk); + TLI.setUnavailable(LibFunc_strlen_chk); + TLI.setUnavailable(LibFunc_fmaximum_num); + TLI.setUnavailable(LibFunc_fmaximum_numf); + TLI.setUnavailable(LibFunc_fmaximum_numl); + TLI.setUnavailable(LibFunc_fminimum_num); + TLI.setUnavailable(LibFunc_fminimum_numf); + TLI.setUnavailable(LibFunc_fminimum_numl); + TLI.setUnavailable(LibFunc_roundeven); + TLI.setUnavailable(LibFunc_roundevenf); + TLI.setUnavailable(LibFunc_roundevenl); + } + // As currently implemented in clang, NVPTX code has no standard library to // speak of. 
Headers provide a standard-ish library implementation, but many // of the signatures are wrong -- for example, many libm functions are not diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp index 5d7c97a..6356d71 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp @@ -37,8 +37,8 @@ void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA, // unsigned long personality; /* Pointer to the personality routine */ // } - auto *EHInfo = - cast<MCSectionXCOFF>(Asm->getObjFileLowering().getCompactUnwindSection()); + auto *EHInfo = static_cast<MCSectionXCOFF *>( + Asm->getObjFileLowering().getCompactUnwindSection()); if (Asm->TM.getFunctionSections()) { // If option -ffunction-sections is on, append the function name to the // name of EH Info Table csect so that each function has its own EH Info diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp index de6ebcf..51342c6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -39,7 +39,7 @@ void ARMException::beginFunction(const MachineFunction *MF) { if (CFISecType == AsmPrinter::CFISection::Debug) { if (!hasEmittedCFISections) { if (Asm->getModuleCFISectionType() == AsmPrinter::CFISection::Debug) - Asm->OutStreamer->emitCFISections(false, true); + Asm->OutStreamer->emitCFISections(false, true, false); hasEmittedCFISections = true; } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index f1d3e96..6166271 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -4221,10 +4221,11 @@ MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { SectionKind Kind = CPE.getSectionKind(&DL); const Constant *C = CPE.Val.ConstVal; Align Alignment = CPE.Alignment; - if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(DL, Kind, C, - Alignment))) { - if (MCSymbol *Sym = S->getCOMDATSymbol()) { + auto *S = + getObjFileLowering().getSectionForConstant(DL, Kind, C, Alignment); + if (S && TM.getTargetTriple().isOSBinFormatCOFF()) { + if (MCSymbol *Sym = + static_cast<const MCSectionCOFF *>(S)->getCOMDATSymbol()) { if (Sym->isUndefined()) OutStreamer->emitSymbolAttribute(Sym, MCSA_Global); return Sym; diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 8abeb56..c5d6e40 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -1051,10 +1051,10 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) { // comdat key. A section may be comdat because of -ffunction-sections or // because it is comdat in the IR. MCSectionCOFF *GVSec = - GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr; + GVSym ? static_cast<MCSectionCOFF *>(&GVSym->getSection()) : nullptr; const MCSymbol *KeySym = GVSec ? 
GVSec->getCOMDATSymbol() : nullptr; - MCSectionCOFF *DebugSec = cast<MCSectionCOFF>( + auto *DebugSec = static_cast<MCSectionCOFF *>( CompilerInfoAsm->getObjFileLowering().getCOFFDebugSymbolsSection()); DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 4fac4bb..6b8d08c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -109,9 +109,11 @@ void DwarfCFIException::beginBasicBlockSection(const MachineBasicBlock &MBB) { // chose not to be verbose in that case. And with `ForceDwarfFrameSection`, // we should always emit .debug_frame. if (CFISecType == AsmPrinter::CFISection::Debug || - Asm->TM.Options.ForceDwarfFrameSection) + Asm->TM.Options.ForceDwarfFrameSection || + Asm->TM.Options.MCOptions.EmitSFrameUnwind) Asm->OutStreamer->emitCFISections( - CFISecType == AsmPrinter::CFISection::EH, true); + CFISecType == AsmPrinter::CFISection::EH, true, + Asm->TM.Options.MCOptions.EmitSFrameUnwind); hasEmittedCFISections = true; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 8e8cda4..5577a7d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1379,7 +1379,7 @@ void DwarfCompileUnit::constructCallSiteParmEntryDIEs( DIE *DwarfCompileUnit::constructImportedEntityDIE( const DIImportedEntity *Module) { - DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag()); + DIE *IMDie = DIE::get(DIEValueAllocator, Module->getTag()); insertDIE(Module, IMDie); DIE *EntityDie; auto *Entity = Module->getEntity(); diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index c3b4077..989cf4c4 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -45,7 +45,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeExpandPostRALegacyPass(Registry); initializeFEntryInserterLegacyPass(Registry); initializeFinalizeISelPass(Registry); - initializeFinalizeMachineBundlesPass(Registry); initializeFixupStatepointCallerSavedLegacyPass(Registry); initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index c21058c..416c56d 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2095,6 +2095,10 @@ static bool isRemOfLoopIncrementWithLoopInvariant( if (!L->isLoopInvariant(RemAmt)) return false; + // Only works if the AddOffset is a loop invaraint + if (AddOffset && !L->isLoopInvariant(AddOffset)) + return false; + // Is the PHI a loop increment? 
auto LoopIncrInfo = getIVIncrement(PN, LI); if (!LoopIncrInfo) diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 714ec55..1c1047c 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -103,10 +103,10 @@ static void expandFPToI(Instruction *FPToI) { Value *A1 = nullptr; if (FloatVal->getType()->isHalfTy()) { if (FPToI->getOpcode() == Instruction::FPToUI) { - Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getIntNTy(32)); + Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getInt32Ty()); A1 = Builder.CreateZExt(A0, IntTy); } else { // FPToSI - Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getIntNTy(32)); + Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getInt32Ty()); A1 = Builder.CreateSExt(A0, IntTy); } FPToI->replaceAllUsesWith(A1); @@ -425,8 +425,8 @@ static void expandIToFP(Instruction *IToFP) { AAddr0->addIncoming(IsSigned ? Sub : IntVal, IfThen4); AAddr0->addIncoming(Shl, SwBB); Value *A0 = Builder.CreateTrunc(AAddr0, Builder.getInt32Ty()); - Value *A1 = Builder.CreateLShr(A0, Builder.getIntN(32, 2)); - Value *A2 = Builder.CreateAnd(A1, Builder.getIntN(32, 1)); + Value *A1 = Builder.CreateLShr(A0, Builder.getInt32(2)); + Value *A2 = Builder.CreateAnd(A1, Builder.getInt32(1)); Value *Conv16 = Builder.CreateZExt(A2, IntTy); Value *Or17 = Builder.CreateOr(AAddr0, Conv16); Value *Inc = Builder.CreateAdd(Or17, Builder.getIntN(BitWidth, 1)); @@ -457,9 +457,9 @@ static void expandIToFP(Instruction *IToFP) { Value *Extract = Builder.CreateLShr(Shr21, Builder.getIntN(BitWidth, 32)); Value *ExtractT62 = nullptr; if (FloatWidth > 80) - ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getIntNTy(64)); + ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getInt64Ty()); else - ExtractT62 = Builder.CreateTrunc(Extract, Builder.getIntNTy(32)); + ExtractT62 = Builder.CreateTrunc(Extract, Builder.getInt32Ty()); Builder.CreateBr(IfEnd26); // if.else: @@ -475,7 +475,7 @@ static void expandIToFP(Instruction *IToFP) { Value *Extract65 = Builder.CreateLShr(Shl26, Builder.getIntN(BitWidth, 32)); Value *ExtractT66 = nullptr; if (FloatWidth > 80) - ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getIntNTy(64)); + ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getInt64Ty()); else ExtractT66 = Builder.CreateTrunc(Extract65, Builder.getInt32Ty()); Builder.CreateBr(IfEnd26); @@ -507,30 +507,29 @@ static void expandIToFP(Instruction *IToFP) { Builder.getIntN(BitWidth, 63)); And29 = Builder.CreateAnd(Shr, Temp2, "and29"); } else { - Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getIntNTy(32)); + Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getInt32Ty()); And29 = Builder.CreateAnd( - Conv28, ConstantInt::getSigned(Builder.getIntNTy(32), 0x80000000)); + Conv28, ConstantInt::getSigned(Builder.getInt32Ty(), 0x80000000)); } unsigned TempMod = FPMantissaWidth % 32; Value *And34 = nullptr; Value *Shl30 = nullptr; if (FloatWidth > 80) { TempMod += 32; - Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getIntN(64, TempMod)); + Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getInt64(TempMod)); Shl30 = Builder.CreateAdd( - Add, - Builder.getIntN(64, ((1ull << (62ull - TempMod)) - 1ull) << TempMod)); - And34 = Builder.CreateZExt(Shl30, Builder.getIntNTy(128)); + Add, Builder.getInt64(((1ull << (62ull - TempMod)) - 1ull) << TempMod)); + And34 = Builder.CreateZExt(Shl30, Builder.getInt128Ty()); } else { - Value *Add = Builder.CreateShl(E0, Builder.getIntN(32, TempMod)); + Value *Add = Builder.CreateShl(E0, Builder.getInt32(TempMod)); Shl30 = 
Builder.CreateAdd( - Add, Builder.getIntN(32, ((1 << (30 - TempMod)) - 1) << TempMod)); + Add, Builder.getInt32(((1 << (30 - TempMod)) - 1) << TempMod)); And34 = Builder.CreateAnd(FloatWidth > 32 ? AAddr1Off32 : AAddr1Off0, - Builder.getIntN(32, (1 << TempMod) - 1)); + Builder.getInt32((1 << TempMod) - 1)); } Value *Or35 = nullptr; if (FloatWidth > 80) { - Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getIntNTy(128)); + Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getInt128Ty()); Value *Or31 = Builder.CreateOr(And29Trunc, And34); Value *Or34 = Builder.CreateShl(Or31, Builder.getIntN(128, 64)); Value *Temp3 = Builder.CreateShl(Builder.getIntN(128, 1), diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 1b69188..5e50898 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -253,6 +253,21 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } +static Value *getMaskOperand(IntrinsicInst *II) { + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic"); + case Intrinsic::vp_load: + return II->getOperand(1); + case Intrinsic::masked_load: + return II->getOperand(2); + case Intrinsic::vp_store: + return II->getOperand(2); + case Intrinsic::masked_store: + return II->getOperand(3); + } +} + // Return the corresponded deinterleaved mask, or nullptr if there is no valid // mask. static Value *getMask(Value *WideMask, unsigned Factor, @@ -268,8 +283,12 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( if (isa<ScalableVectorType>(Load->getType())) return false; - if (auto *LI = dyn_cast<LoadInst>(Load); - LI && !LI->isSimple()) + auto *LI = dyn_cast<LoadInst>(Load); + auto *II = dyn_cast<IntrinsicInst>(Load); + if (!LI && !II) + return false; + + if (LI && !LI->isSimple()) return false; // Check if all users of this load are shufflevectors. If we encounter any @@ -322,7 +341,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( // Holds the corresponding index for each DE-interleave shuffle. SmallVector<unsigned, 4> Indices; - Type *VecTy = FirstSVI->getType(); + VectorType *VecTy = cast<VectorType>(FirstSVI->getType()); // Check if other shufflevectors are also DE-interleaved of the same type // and factor as the first shufflevector. @@ -360,13 +379,16 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); Value *Mask = nullptr; - if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) { - Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy)); + if (LI) { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); + } else { + // Check mask operand. Handle both all-true/false and interleaved mask. 
+ Mask = getMask(getMaskOperand(II), Factor, VecTy); if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); - } else { - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); + + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " + << *Load << "\n"); } // Try to create target specific intrinsics to replace the load and @@ -483,15 +505,16 @@ bool InterleavedAccessImpl::tryReplaceExtracts( bool InterleavedAccessImpl::lowerInterleavedStore( Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) { Value *StoredValue; - if (auto *SI = dyn_cast<StoreInst>(Store)) { + auto *SI = dyn_cast<StoreInst>(Store); + auto *II = dyn_cast<IntrinsicInst>(Store); + if (SI) { if (!SI->isSimple()) return false; StoredValue = SI->getValueOperand(); - } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) { - assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); - StoredValue = VPStore->getArgOperand(0); } else { - llvm_unreachable("unsupported store operation"); + assert(II->getIntrinsicID() == Intrinsic::vp_store || + II->getIntrinsicID() == Intrinsic::masked_store); + StoredValue = II->getArgOperand(0); } auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue); @@ -508,18 +531,18 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); Value *Mask = nullptr; - if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) { + if (SI) { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); + } else { + // Check mask operand. Handle both all-true/false and interleaved mask. unsigned LaneMaskLen = NumStoredElements / Factor; - Mask = getMask(VPStore->getMaskParam(), Factor, + Mask = getMask(getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen)); if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store - << "\n"); - - } else { - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " + << *Store << "\n"); } // Try to create target specific intrinsics to replace the store and @@ -564,6 +587,27 @@ static Value *getMask(Value *WideMask, unsigned Factor, } } + if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) { + // Check that the shuffle mask is: a) an interleave, b) all of the same + // set of the elements, and c) contained by the first source. (c) could + // be relaxed if desired. + unsigned NumSrcElts = + cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements(); + SmallVector<unsigned> StartIndexes; + if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor, + NumSrcElts * 2, StartIndexes) && + llvm::all_of(StartIndexes, [](unsigned Start) { return Start == 0; }) && + llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) { + return Idx < (int)NumSrcElts; + })) { + auto *LeafMaskTy = + VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); + IRBuilder<> Builder(SVI); + return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)); + } + } + return nullptr; } @@ -590,21 +634,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( << " and factor = " << Factor << "\n"); } else { assert(II); - - // Check mask operand. Handle both all-true/false and interleaved mask. 
- Value *WideMask; - switch (II->getIntrinsicID()) { - default: + if (II->getIntrinsicID() != Intrinsic::masked_load && + II->getIntrinsicID() != Intrinsic::vp_load) return false; - case Intrinsic::vp_load: - WideMask = II->getOperand(1); - break; - case Intrinsic::masked_load: - WideMask = II->getOperand(2); - break; - } - Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI)); + // Check mask operand. Handle both all-true/false and interleaved mask. + Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; @@ -641,19 +676,11 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( Value *Mask = nullptr; if (II) { - // Check mask operand. Handle both all-true/false and interleaved mask. - Value *WideMask; - switch (II->getIntrinsicID()) { - default: + if (II->getIntrinsicID() != Intrinsic::masked_store && + II->getIntrinsicID() != Intrinsic::vp_store) return false; - case Intrinsic::vp_store: - WideMask = II->getOperand(2); - break; - case Intrinsic::masked_store: - WideMask = II->getOperand(3); - break; - } - Mask = getMask(WideMask, Factor, + // Check mask operand. Handle both all-true/false and interleaved mask. + Mask = getMask(getMaskOperand(II), Factor, cast<VectorType>(InterleaveValues[0]->getType())); if (!Mask) return false; @@ -687,11 +714,13 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) { using namespace PatternMatch; for (auto &I : instructions(F)) { if (match(&I, m_CombineOr(m_Load(m_Value()), - m_Intrinsic<Intrinsic::vp_load>()))) + m_Intrinsic<Intrinsic::vp_load>())) || + match(&I, m_Intrinsic<Intrinsic::masked_load>())) Changed |= lowerInterleavedLoad(&I, DeadInsts); if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()), - m_Intrinsic<Intrinsic::vp_store>()))) + m_Intrinsic<Intrinsic::vp_store>())) || + match(&I, m_Intrinsic<Intrinsic::masked_store>())) Changed |= lowerInterleavedStore(&I, DeadInsts); if (auto *II = dyn_cast<IntrinsicInst>(&I)) { diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 7710b50..bc4e299 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -815,6 +815,9 @@ static void printMI(raw_ostream &OS, MFPrintState &State, if (MI.getFlag(MachineInstr::SameSign)) OS << "samesign "; + // NOTE: Please add new MIFlags also to the MI_FLAGS_STR in + // llvm/utils/update_mir_test_checks.py. 
+ OS << TII->getName(MI.getOpcode()); LS = ListSeparator(); diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 44b648a..4da0184 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -83,27 +83,6 @@ llvm::createUnpackMachineBundles( return new UnpackMachineBundles(std::move(Ftor)); } -namespace { - class FinalizeMachineBundles : public MachineFunctionPass { - public: - static char ID; // Pass identification - FinalizeMachineBundles() : MachineFunctionPass(ID) { - initializeFinalizeMachineBundlesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - }; -} // end anonymous namespace - -char FinalizeMachineBundles::ID = 0; -char &llvm::FinalizeMachineBundlesID = FinalizeMachineBundles::ID; -INITIALIZE_PASS(FinalizeMachineBundles, "finalize-mi-bundles", - "Finalize machine instruction bundles", false, false) - -bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) { - return llvm::finalizeBundles(MF); -} - /// Return the first found DebugLoc that has a DILocation, given a range of /// instructions. The search range is from FirstMI to LastMI (exclusive). If no /// DILocation is found, then an empty location is returned. diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index e144111..286fbfd 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -49,7 +49,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <algorithm> #include <cassert> #include <limits> #include <vector> diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index e5704c0..583a85a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" @@ -357,6 +358,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::PATCHPOINT: Res = PromoteIntRes_PATCHPOINT(N); break; + case ISD::READ_REGISTER: + Res = PromoteIntRes_READ_REGISTER(N); + break; } // If the result is null then the sub-method took care of registering it. 
@@ -2076,6 +2080,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::PATCHPOINT: Res = PromoteIntOp_PATCHPOINT(N, OpNo); break; + case ISD::WRITE_REGISTER: + Res = PromoteIntOp_WRITE_REGISTER(N, OpNo); + break; case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: case ISD::EXPERIMENTAL_VP_STRIDED_STORE: Res = PromoteIntOp_VP_STRIDED(N, OpNo); @@ -2853,6 +2860,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_WRITE_REGISTER(SDNode *N, + unsigned OpNo) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.write_register with illegal type", Fn, + N->getDebugLoc())); + return N->getOperand(0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) { assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) || (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4)); @@ -3127,6 +3143,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VSCALE: ExpandIntRes_VSCALE(N, Lo, Hi); break; + + case ISD::READ_REGISTER: + ExpandIntRes_READ_REGISTER(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -5471,6 +5491,18 @@ void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo, SplitInteger(Res, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, + SDValue &Hi) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc())); + ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + Lo = DAG.getPOISON(LoVT); + Hi = DAG.getPOISON(HiVT); +} + //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// @@ -5537,6 +5569,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VP_STRIDED_STORE: Res = ExpandIntOp_VP_STRIDED(N, OpNo); break; + case ISD::WRITE_REGISTER: + Res = ExpandIntOp_WRITE_REGISTER(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
@@ -5935,6 +5970,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.write_register with illegal type", Fn, + N->getDebugLoc())); + + return N->getOperand(0); +} + SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) { SDLoc dl(N); @@ -6332,6 +6376,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_PATCHPOINT(SDNode *N) { return Res.getValue(0); } +SDValue DAGTypeLegalizer::PromoteIntRes_READ_REGISTER(SDNode *N) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure( + "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc())); + + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + return DAG.getPOISON(NVT); +} + SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9b53724..2e13b18 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -378,6 +378,7 @@ private: SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); + SDValue PromoteIntRes_READ_REGISTER(SDNode *N); SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); @@ -428,6 +429,7 @@ private: SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); @@ -511,6 +513,7 @@ private: void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandShiftByConstant(SDNode *N, const APInt &Amt, SDValue &Lo, SDValue &Hi); @@ -534,6 +537,7 @@ private: SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo); SDValue ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo); SDValue ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); + SDValue ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo); void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &dl); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index b79911b..2a8234a 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -588,7 +588,14 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, continue; Instruction *CheckLoc = dyn_cast<ReturnInst>(BB.getTerminator()); if (!CheckLoc && !DisableCheckNoReturn) - for (auto &Inst : BB) + for (auto &Inst : BB) { + if (IntrinsicInst *IB = dyn_cast<IntrinsicInst>(&Inst); + IB && (IB->getIntrinsicID() == 
Intrinsic::eh_sjlj_callsite)) { + // eh_sjlj_callsite has to be in same BB as the + // bb terminator. Don't insert within this range. + CheckLoc = IB; + break; + } if (auto *CB = dyn_cast<CallBase>(&Inst)) // Do stack check before noreturn calls that aren't nounwind (e.g: // __cxa_throw). @@ -596,6 +603,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, CheckLoc = CB; break; } + } if (!CheckLoc) continue; diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 7e501a9..408d07b 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -42,7 +42,6 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/PseudoProbe.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmInfoDarwin.h" @@ -996,7 +995,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForLSDA( if (!LSDASection || (!F.hasComdat() && !TM.getFunctionSections())) return LSDASection; - const auto *LSDA = cast<MCSectionELF>(LSDASection); + const auto *LSDA = static_cast<const MCSectionELF *>(LSDASection); unsigned Flags = LSDA->getFlags(); const MCSymbolELF *LinkedToSym = nullptr; StringRef Group; @@ -2055,14 +2054,14 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection( unsigned Priority, const MCSymbol *KeySym) const { return getCOFFStaticStructorSection( getContext(), getContext().getTargetTriple(), true, Priority, KeySym, - cast<MCSectionCOFF>(StaticCtorSection)); + static_cast<MCSectionCOFF *>(StaticCtorSection)); } MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( unsigned Priority, const MCSymbol *KeySym) const { return getCOFFStaticStructorSection( getContext(), getContext().getTargetTriple(), false, Priority, KeySym, - cast<MCSectionCOFF>(StaticDtorSection)); + static_cast<MCSectionCOFF *>(StaticDtorSection)); } const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference( @@ -2389,23 +2388,25 @@ TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV, // here. 
if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) { if (GO->isDeclarationForLinker()) - return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM)) + return static_cast<const MCSectionXCOFF *>( + getSectionForExternalReference(GO, TM)) ->getQualNameSymbol(); if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->hasAttribute("toc-data")) - return cast<MCSectionXCOFF>( + return static_cast<const MCSectionXCOFF *>( SectionForGlobal(GVar, SectionKind::getData(), TM)) ->getQualNameSymbol(); SectionKind GOKind = getKindForGlobal(GO, TM); if (GOKind.isText()) - return cast<MCSectionXCOFF>( + return static_cast<const MCSectionXCOFF *>( getSectionForFunctionDescriptor(cast<Function>(GO), TM)) ->getQualNameSymbol(); if ((TM.getDataSections() && !GO->hasSection()) || GO->hasCommonLinkage() || GOKind.isBSSLocal() || GOKind.isThreadBSSLocal()) - return cast<MCSectionXCOFF>(SectionForGlobal(GO, GOKind, TM)) + return static_cast<const MCSectionXCOFF *>( + SectionForGlobal(GO, GOKind, TM)) ->getQualNameSymbol(); } @@ -2741,7 +2742,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry( MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA( const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const { - auto *LSDA = cast<MCSectionXCOFF>(LSDASection); + auto *LSDA = static_cast<MCSectionXCOFF *>(LSDASection); if (TM.getFunctionSections()) { // If option -ffunction-sections is on, append the function name to the // name of the LSDA csect so that each function has its own LSDA csect. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp index 2abab02..4d879b6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp @@ -8,12 +8,9 @@ #include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h" #include "llvm/DebugInfo/DIContext.h" -#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFExpressionPrinter.h" #include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/DataExtractor.h" -#include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp index 7072418..9a7f7d1 100644 --- a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp +++ b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFExpression.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/LowLevel/DWARFExpression.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/Support/Format.h" #include <cassert> #include <cstdint> #include <vector> diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index b3798f1..a8559e7 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -183,7 +183,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { std::lock_guard<sys::Mutex> locked(lock); // Save information about our target - Arch = (Triple::ArchType)Obj.getArch(); + Arch = Obj.getArch(); IsTargetLittleEndian = Obj.isLittleEndian(); setMipsABI(Obj); @@ -1361,18 +1361,17 @@ std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyld::loadObject(const ObjectFile &Obj) { if (!Dyld) { if (Obj.isELF()) - 
Dyld = - createRuntimeDyldELF(static_cast<Triple::ArchType>(Obj.getArch()), - MemMgr, Resolver, ProcessAllSections, - std::move(NotifyStubEmitted)); + Dyld = createRuntimeDyldELF(Obj.getArch(), MemMgr, Resolver, + ProcessAllSections, + std::move(NotifyStubEmitted)); else if (Obj.isMachO()) - Dyld = createRuntimeDyldMachO( - static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver, - ProcessAllSections, std::move(NotifyStubEmitted)); + Dyld = createRuntimeDyldMachO(Obj.getArch(), MemMgr, Resolver, + ProcessAllSections, + std::move(NotifyStubEmitted)); else if (Obj.isCOFF()) - Dyld = createRuntimeDyldCOFF( - static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver, - ProcessAllSections, std::move(NotifyStubEmitted)); + Dyld = createRuntimeDyldCOFF(Obj.getArch(), MemMgr, Resolver, + ProcessAllSections, + std::move(NotifyStubEmitted)); else report_fatal_error("Incompatible object format!"); } diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index b79f6ec..ce35a5b 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1360,6 +1360,12 @@ void Pattern::printFuzzyMatch(const SourceMgr &SM, StringRef Buffer, size_t Best = StringRef::npos; double BestQuality = 0; + // Arbitrarily limit quadratic search behavior stemming from long CHECK lines. + if (size_t(4096) * size_t(2048) < + std::min(size_t(4096), Buffer.size()) * + std::max(FixedStr.size(), RegExStr.size())) + return; + // Use an arbitrary 4k limit on how far we will search. for (size_t i = 0, e = std::min(size_t(4096), Buffer.size()); i != e; ++i) { if (Buffer[i] == '\n') diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index f7669f0..53f5934 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Frontend/HLSL/RootSignatureMetadata.h" +#include "llvm/Frontend/HLSL/RootSignatureValidations.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/ScopedPrinter.h" @@ -20,6 +22,42 @@ namespace llvm { namespace hlsl { namespace rootsig { +static std::optional<uint32_t> extractMdIntValue(MDNode *Node, + unsigned int OpId) { + if (auto *CI = + mdconst::dyn_extract<ConstantInt>(Node->getOperand(OpId).get())) + return CI->getZExtValue(); + return std::nullopt; +} + +static std::optional<float> extractMdFloatValue(MDNode *Node, + unsigned int OpId) { + if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get())) + return CI->getValueAPF().convertToFloat(); + return std::nullopt; +} + +static std::optional<StringRef> extractMdStringValue(MDNode *Node, + unsigned int OpId) { + MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId)); + if (NodeText == nullptr) + return std::nullopt; + return NodeText->getString(); +} + +static bool reportError(LLVMContext *Ctx, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); + return true; +} + +static bool reportValueError(LLVMContext *Ctx, Twine ParamName, + uint32_t Value) { + Ctx->diagnose(DiagnosticInfoGeneric( + "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error)); + return true; +} + static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = { {"CBV", dxil::ResourceClass::CBuffer}, {"SRV", dxil::ResourceClass::SRV}, @@ 
-189,6 +227,442 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) { return MDNode::get(Ctx, Operands); } +bool MetadataParser::parseRootFlags(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *RootFlagNode) { + + if (RootFlagNode->getNumOperands() != 2) + return reportError(Ctx, "Invalid format for RootFlag Element"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1)) + RSD.Flags = *Val; + else + return reportError(Ctx, "Invalid value for RootFlag"); + + return false; +} + +bool MetadataParser::parseRootConstants(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *RootConstantNode) { + + if (RootConstantNode->getNumOperands() != 5) + return reportError(Ctx, "Invalid format for RootConstants Element"); + + dxbc::RTS0::v1::RootParameterHeader Header; + // The parameter offset doesn't matter here - we recalculate it during + // serialization Header.ParameterOffset = 0; + Header.ParameterType = + llvm::to_underlying(dxbc::RootParameterType::Constants32Bit); + + if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1)) + Header.ShaderVisibility = *Val; + else + return reportError(Ctx, "Invalid value for ShaderVisibility"); + + dxbc::RTS0::v1::RootConstants Constants; + if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2)) + Constants.ShaderRegister = *Val; + else + return reportError(Ctx, "Invalid value for ShaderRegister"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3)) + Constants.RegisterSpace = *Val; + else + return reportError(Ctx, "Invalid value for RegisterSpace"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4)) + Constants.Num32BitValues = *Val; + else + return reportError(Ctx, "Invalid value for Num32BitValues"); + + RSD.ParametersContainer.addParameter(Header, Constants); + + return false; +} + +bool MetadataParser::parseRootDescriptors( + LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, + MDNode *RootDescriptorNode, RootSignatureElementKind ElementKind) { + assert(ElementKind == RootSignatureElementKind::SRV || + ElementKind == RootSignatureElementKind::UAV || + ElementKind == RootSignatureElementKind::CBV && + "parseRootDescriptors should only be called with RootDescriptor " + "element kind."); + if (RootDescriptorNode->getNumOperands() != 5) + return reportError(Ctx, "Invalid format for Root Descriptor Element"); + + dxbc::RTS0::v1::RootParameterHeader Header; + switch (ElementKind) { + case RootSignatureElementKind::SRV: + Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV); + break; + case RootSignatureElementKind::UAV: + Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV); + break; + case RootSignatureElementKind::CBV: + Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV); + break; + default: + llvm_unreachable("invalid Root Descriptor kind"); + break; + } + + if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1)) + Header.ShaderVisibility = *Val; + else + return reportError(Ctx, "Invalid value for ShaderVisibility"); + + dxbc::RTS0::v2::RootDescriptor Descriptor; + if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2)) + Descriptor.ShaderRegister = *Val; + else + return reportError(Ctx, "Invalid value for ShaderRegister"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3)) + Descriptor.RegisterSpace = *Val; + else + return reportError(Ctx, "Invalid value for RegisterSpace"); + + 
if (RSD.Version == 1) { + RSD.ParametersContainer.addParameter(Header, Descriptor); + return false; + } + assert(RSD.Version > 1); + + if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4)) + Descriptor.Flags = *Val; + else + return reportError(Ctx, "Invalid value for Root Descriptor Flags"); + + RSD.ParametersContainer.addParameter(Header, Descriptor); + return false; +} + +bool MetadataParser::parseDescriptorRange(LLVMContext *Ctx, + mcdxbc::DescriptorTable &Table, + MDNode *RangeDescriptorNode) { + + if (RangeDescriptorNode->getNumOperands() != 6) + return reportError(Ctx, "Invalid format for Descriptor Range"); + + dxbc::RTS0::v2::DescriptorRange Range; + + std::optional<StringRef> ElementText = + extractMdStringValue(RangeDescriptorNode, 0); + + if (!ElementText.has_value()) + return reportError(Ctx, "Descriptor Range, first element is not a string."); + + Range.RangeType = + StringSwitch<uint32_t>(*ElementText) + .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV)) + .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV)) + .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV)) + .Case("Sampler", + llvm::to_underlying(dxbc::DescriptorRangeType::Sampler)) + .Default(~0U); + + if (Range.RangeType == ~0U) + return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText); + + if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1)) + Range.NumDescriptors = *Val; + else + return reportError(Ctx, "Invalid value for Number of Descriptor in Range"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2)) + Range.BaseShaderRegister = *Val; + else + return reportError(Ctx, "Invalid value for BaseShaderRegister"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3)) + Range.RegisterSpace = *Val; + else + return reportError(Ctx, "Invalid value for RegisterSpace"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4)) + Range.OffsetInDescriptorsFromTableStart = *Val; + else + return reportError(Ctx, + "Invalid value for OffsetInDescriptorsFromTableStart"); + + if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5)) + Range.Flags = *Val; + else + return reportError(Ctx, "Invalid value for Descriptor Range Flags"); + + Table.Ranges.push_back(Range); + return false; +} + +bool MetadataParser::parseDescriptorTable(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *DescriptorTableNode) { + const unsigned int NumOperands = DescriptorTableNode->getNumOperands(); + if (NumOperands < 2) + return reportError(Ctx, "Invalid format for Descriptor Table"); + + dxbc::RTS0::v1::RootParameterHeader Header; + if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1)) + Header.ShaderVisibility = *Val; + else + return reportError(Ctx, "Invalid value for ShaderVisibility"); + + mcdxbc::DescriptorTable Table; + Header.ParameterType = + llvm::to_underlying(dxbc::RootParameterType::DescriptorTable); + + for (unsigned int I = 2; I < NumOperands; I++) { + MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I)); + if (Element == nullptr) + return reportError(Ctx, "Missing Root Element Metadata Node."); + + if (parseDescriptorRange(Ctx, Table, Element)) + return true; + } + + RSD.ParametersContainer.addParameter(Header, Table); + return false; +} + +bool MetadataParser::parseStaticSampler(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *StaticSamplerNode) { + if 
(StaticSamplerNode->getNumOperands() != 14) + return reportError(Ctx, "Invalid format for Static Sampler"); + + dxbc::RTS0::v1::StaticSampler Sampler; + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1)) + Sampler.Filter = *Val; + else + return reportError(Ctx, "Invalid value for Filter"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2)) + Sampler.AddressU = *Val; + else + return reportError(Ctx, "Invalid value for AddressU"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3)) + Sampler.AddressV = *Val; + else + return reportError(Ctx, "Invalid value for AddressV"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4)) + Sampler.AddressW = *Val; + else + return reportError(Ctx, "Invalid value for AddressW"); + + if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5)) + Sampler.MipLODBias = *Val; + else + return reportError(Ctx, "Invalid value for MipLODBias"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6)) + Sampler.MaxAnisotropy = *Val; + else + return reportError(Ctx, "Invalid value for MaxAnisotropy"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7)) + Sampler.ComparisonFunc = *Val; + else + return reportError(Ctx, "Invalid value for ComparisonFunc "); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8)) + Sampler.BorderColor = *Val; + else + return reportError(Ctx, "Invalid value for ComparisonFunc "); + + if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9)) + Sampler.MinLOD = *Val; + else + return reportError(Ctx, "Invalid value for MinLOD"); + + if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10)) + Sampler.MaxLOD = *Val; + else + return reportError(Ctx, "Invalid value for MaxLOD"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11)) + Sampler.ShaderRegister = *Val; + else + return reportError(Ctx, "Invalid value for ShaderRegister"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12)) + Sampler.RegisterSpace = *Val; + else + return reportError(Ctx, "Invalid value for RegisterSpace"); + + if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13)) + Sampler.ShaderVisibility = *Val; + else + return reportError(Ctx, "Invalid value for ShaderVisibility"); + + RSD.StaticSamplers.push_back(Sampler); + return false; +} + +bool MetadataParser::parseRootSignatureElement(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *Element) { + std::optional<StringRef> ElementText = extractMdStringValue(Element, 0); + if (!ElementText.has_value()) + return reportError(Ctx, "Invalid format for Root Element"); + + RootSignatureElementKind ElementKind = + StringSwitch<RootSignatureElementKind>(*ElementText) + .Case("RootFlags", RootSignatureElementKind::RootFlags) + .Case("RootConstants", RootSignatureElementKind::RootConstants) + .Case("RootCBV", RootSignatureElementKind::CBV) + .Case("RootSRV", RootSignatureElementKind::SRV) + .Case("RootUAV", RootSignatureElementKind::UAV) + .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) + .Case("StaticSampler", RootSignatureElementKind::StaticSamplers) + .Default(RootSignatureElementKind::Error); + + switch (ElementKind) { + + case RootSignatureElementKind::RootFlags: + return parseRootFlags(Ctx, RSD, Element); + case RootSignatureElementKind::RootConstants: + return parseRootConstants(Ctx, RSD, Element); + case 
RootSignatureElementKind::CBV: + case RootSignatureElementKind::SRV: + case RootSignatureElementKind::UAV: + return parseRootDescriptors(Ctx, RSD, Element, ElementKind); + case RootSignatureElementKind::DescriptorTable: + return parseDescriptorTable(Ctx, RSD, Element); + case RootSignatureElementKind::StaticSamplers: + return parseStaticSampler(Ctx, RSD, Element); + case RootSignatureElementKind::Error: + return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText); + } + + llvm_unreachable("Unhandled RootSignatureElementKind enum."); +} + +bool MetadataParser::validateRootSignature( + LLVMContext *Ctx, const llvm::mcdxbc::RootSignatureDesc &RSD) { + if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) { + return reportValueError(Ctx, "Version", RSD.Version); + } + + if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) { + return reportValueError(Ctx, "RootFlags", RSD.Flags); + } + + for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) { + if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility)) + return reportValueError(Ctx, "ShaderVisibility", + Info.Header.ShaderVisibility); + + assert(dxbc::isValidParameterType(Info.Header.ParameterType) && + "Invalid value for ParameterType"); + + switch (Info.Header.ParameterType) { + + case llvm::to_underlying(dxbc::RootParameterType::CBV): + case llvm::to_underlying(dxbc::RootParameterType::UAV): + case llvm::to_underlying(dxbc::RootParameterType::SRV): { + const dxbc::RTS0::v2::RootDescriptor &Descriptor = + RSD.ParametersContainer.getRootDescriptor(Info.Location); + if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister)) + return reportValueError(Ctx, "ShaderRegister", + Descriptor.ShaderRegister); + + if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace)) + return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace); + + if (RSD.Version > 1) { + if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version, + Descriptor.Flags)) + return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags); + } + break; + } + case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): { + const mcdxbc::DescriptorTable &Table = + RSD.ParametersContainer.getDescriptorTable(Info.Location); + for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) { + if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType)) + return reportValueError(Ctx, "RangeType", Range.RangeType); + + if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace)) + return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace); + + if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors)) + return reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors); + + if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag( + RSD.Version, Range.RangeType, Range.Flags)) + return reportValueError(Ctx, "DescriptorFlag", Range.Flags); + } + break; + } + } + } + + for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) { + if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter)) + return reportValueError(Ctx, "Filter", Sampler.Filter); + + if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU)) + return reportValueError(Ctx, "AddressU", Sampler.AddressU); + + if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV)) + return reportValueError(Ctx, "AddressV", Sampler.AddressV); + + if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW)) + return reportValueError(Ctx, "AddressW", Sampler.AddressW); + + if 
(!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias)) + return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias); + + if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy)) + return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy); + + if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc)) + return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc); + + if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor)) + return reportValueError(Ctx, "BorderColor", Sampler.BorderColor); + + if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD)) + return reportValueError(Ctx, "MinLOD", Sampler.MinLOD); + + if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD)) + return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD); + + if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister)) + return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister); + + if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace)) + return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace); + + if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility)) + return reportValueError(Ctx, "ShaderVisibility", + Sampler.ShaderVisibility); + } + + return false; +} + +bool MetadataParser::ParseRootSignature(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD) { + bool HasError = false; + + // Loop through the Root Elements of the root signature. + for (const auto &Operand : Root->operands()) { + MDNode *Element = dyn_cast<MDNode>(Operand); + if (Element == nullptr) + return reportError(Ctx, "Missing Root Element Metadata Node."); + + HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element) || + validateRootSignature(Ctx, RSD); + } + + return HasError; +} } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 28ed1e5..7159107 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1450,6 +1450,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.ll", true) .Case("h2f", true) .Case("swap.lo.hi.b64", true) + .Case("tanh.approx.f32", true) .Default(false); if (Expand) { @@ -2543,6 +2544,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, MDNode *MD = MDNode::get(Builder.getContext(), {}); LD->setMetadata(LLVMContext::MD_invariant_load, MD); return LD; + } else if (Name == "tanh.approx.f32") { + // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32 + FastMathFlags FMF; + FMF.setApproxFunc(); + Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0), + FMF); } else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") { Value *Arg = Name.ends_with('0') ? 
Builder.getInt32(0) : CI->getArgOperand(0); diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index b94dcac..4f37624 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -81,6 +81,10 @@ void DiagnosticInfoInlineAsm::print(DiagnosticPrinter &DP) const { DP << " at line " << getLocCookie(); } +void DiagnosticInfoLegalizationFailure::print(DiagnosticPrinter &DP) const { + DP << getLocationStr() << ": " << getMsgStr(); +} + DiagnosticInfoRegAllocFailure::DiagnosticInfoRegAllocFailure( const Twine &MsgStr, const Function &Fn, const DiagnosticLocation &DL, DiagnosticSeverity Severity) diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 5e1bf28..9c34662 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -304,14 +304,12 @@ IntegerType *Type::getIntNTy(LLVMContext &C, unsigned N) { Type *Type::getWasm_ExternrefTy(LLVMContext &C) { // opaque pointer in addrspace(10) - static PointerType *Ty = PointerType::get(C, 10); - return Ty; + return PointerType::get(C, 10); } Type *Type::getWasm_FuncrefTy(LLVMContext &C) { // opaque pointer in addrspace(20) - static PointerType *Ty = PointerType::get(C, 20); - return Ty; + return PointerType::get(C, 20); } //===----------------------------------------------------------------------===// @@ -324,12 +322,12 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) { // Check for the built-in integer types switch (NumBits) { - case 1: return cast<IntegerType>(Type::getInt1Ty(C)); - case 8: return cast<IntegerType>(Type::getInt8Ty(C)); - case 16: return cast<IntegerType>(Type::getInt16Ty(C)); - case 32: return cast<IntegerType>(Type::getInt32Ty(C)); - case 64: return cast<IntegerType>(Type::getInt64Ty(C)); - case 128: return cast<IntegerType>(Type::getInt128Ty(C)); + case 1: return Type::getInt1Ty(C); + case 8: return Type::getInt8Ty(C); + case 16: return Type::getInt16Ty(C); + case 32: return Type::getInt32Ty(C); + case 64: return Type::getInt64Ty(C); + case 128: return Type::getInt128Ty(C); default: break; } diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index d662c42..18a85b3 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -43,13 +43,7 @@ add_llvm_component_library(LLVMMC MCRegisterInfo.cpp MCSchedule.cpp MCSection.cpp - MCSectionCOFF.cpp - MCSectionDXContainer.cpp - MCSectionELF.cpp - MCSectionGOFF.cpp MCSectionMachO.cpp - MCSectionWasm.cpp - MCSectionXCOFF.cpp MCStreamer.cpp MCSPIRVStreamer.cpp MCSubtargetInfo.cpp diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 9f52b3e..ae8dffc 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -559,20 +559,7 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) { } else { const MCSectionELF &Section = static_cast<const MCSectionELF &>(Symbol.getSection()); - - // We may end up with a situation when section symbol is technically - // defined, but should not be. That happens because we explicitly - // pre-create few .debug_* sections to have accessors. - // And if these sections were not really defined in the code, but were - // referenced, we simply error out. 
- if (!Section.isRegistered()) { - assert(static_cast<const MCSymbolELF &>(Symbol).getType() == - ELF::STT_SECTION); - Ctx.reportError(SMLoc(), - "Undefined section reference: " + Symbol.getName()); - continue; - } - + assert(Section.isRegistered()); if (Mode == NonDwoOnly && isDwoSection(Section)) continue; MSD.SectionIndex = Section.getOrdinal(); @@ -1100,7 +1087,8 @@ uint64_t ELFWriter::writeObject() { // Remember the offset into the file for this section. const uint64_t SecStart = align(RelSection->getAlign()); - writeRelocations(cast<MCSectionELF>(*RelSection->getLinkedToSection())); + writeRelocations( + static_cast<const MCSectionELF &>(*RelSection->getLinkedToSection())); uint64_t SecEnd = W.OS.tell(); RelSection->setOffsets(SecStart, SecEnd); @@ -1273,7 +1261,7 @@ bool ELFObjectWriter::useSectionSymbol(const MCValue &Val, // that it pointed to another string and subtracting 42 at runtime will // produce the wrong value. if (Sym->isInSection()) { - auto &Sec = cast<MCSectionELF>(Sym->getSection()); + auto &Sec = static_cast<const MCSectionELF &>(Sym->getSection()); unsigned Flags = Sec.getFlags(); if (Flags & ELF::SHF_MERGE) { if (C != 0) @@ -1325,13 +1313,14 @@ bool ELFObjectWriter::checkRelocation(SMLoc Loc, const MCSectionELF *From, void ELFObjectWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - const MCSectionELF &Section = cast<MCSectionELF>(*F.getParent()); + auto &Section = static_cast<const MCSectionELF &>(*F.getParent()); MCContext &Ctx = getContext(); const auto *SymA = cast_or_null<MCSymbolELF>(Target.getAddSym()); - const MCSectionELF *SecA = (SymA && SymA->isInSection()) - ? cast<MCSectionELF>(&SymA->getSection()) - : nullptr; + const MCSectionELF *SecA = + (SymA && SymA->isInSection()) + ? static_cast<const MCSectionELF *>(&SymA->getSection()) + : nullptr; if (DwoOS && !checkRelocation(Fixup.getLoc(), &Section, SecA)) return; diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 1871f5f..88188f3 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -336,7 +336,7 @@ void GOFFWriter::defineSymbols() { unsigned Ordinal = 0; // Process all sections. for (MCSection &S : Asm) { - auto &Section = cast<MCSectionGOFF>(S); + auto &Section = static_cast<MCSectionGOFF &>(S); Section.setOrdinal(++Ordinal); defineSectionSymbols(Section); } diff --git a/llvm/lib/MC/MCAsmInfoCOFF.cpp b/llvm/lib/MC/MCAsmInfoCOFF.cpp index 0b8781c..54717df 100644 --- a/llvm/lib/MC/MCAsmInfoCOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoCOFF.cpp @@ -12,7 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/BinaryFormat/COFF.h" #include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> using namespace llvm; @@ -49,6 +55,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { HasCOFFComdatConstants = true; } +bool MCAsmInfoCOFF::useCodeAlign(const MCSection &Sec) const { + return Sec.isText(); +} + void MCAsmInfoMicrosoft::anchor() {} MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default; @@ -64,3 +74,101 @@ MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() { // We don't create constants in comdat sections for MinGW. 
HasCOFFComdatConstants = false; } + +bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name) const { + if (COMDATSymbol || isUnique()) + return false; + + // FIXME: Does .section .bss/.data/.text work everywhere?? + if (Name == ".text" || Name == ".data" || Name == ".bss") + return true; + + return false; +} + +void MCSectionCOFF::setSelection(int Selection) const { + assert(Selection != 0 && "invalid COMDAT selection type"); + this->Selection = Selection; + Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; +} + +void MCAsmInfoCOFF::printSwitchToSection(const MCSection &Section, uint32_t, + const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionCOFF &>(Section); + // standard sections don't require the '.section' + if (Sec.shouldOmitSectionDirective(Sec.getName())) { + OS << '\t' << Sec.getName() << '\n'; + return; + } + + OS << "\t.section\t" << Sec.getName() << ",\""; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) + OS << 'd'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) + OS << 'b'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE) + OS << 'x'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE) + OS << 'w'; + else if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_READ) + OS << 'r'; + else + OS << 'y'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE) + OS << 'n'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED) + OS << 's'; + if ((Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) && + !Sec.isImplicitlyDiscardable(Sec.getName())) + OS << 'D'; + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO) + OS << 'i'; + OS << '"'; + + // unique should be tail of .section directive. + if (Sec.isUnique() && !Sec.COMDATSymbol) + OS << ",unique," << Sec.UniqueID; + + if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { + if (Sec.COMDATSymbol) + OS << ","; + else + OS << "\n\t.linkonce\t"; + switch (Sec.Selection) { + case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: + OS << "one_only"; + break; + case COFF::IMAGE_COMDAT_SELECT_ANY: + OS << "discard"; + break; + case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE: + OS << "same_size"; + break; + case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH: + OS << "same_contents"; + break; + case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE: + OS << "associative"; + break; + case COFF::IMAGE_COMDAT_SELECT_LARGEST: + OS << "largest"; + break; + case COFF::IMAGE_COMDAT_SELECT_NEWEST: + OS << "newest"; + break; + default: + assert(false && "unsupported COFF selection type"); + break; + } + if (Sec.COMDATSymbol) { + OS << ","; + Sec.COMDATSymbol->print(OS, this); + } + } + + if (Sec.isUnique() && Sec.COMDATSymbol) + OS << ",unique," << Sec.UniqueID; + + OS << '\n'; +} diff --git a/llvm/lib/MC/MCAsmInfoDarwin.cpp b/llvm/lib/MC/MCAsmInfoDarwin.cpp index 9cba775..e156fa0 100644 --- a/llvm/lib/MC/MCAsmInfoDarwin.cpp +++ b/llvm/lib/MC/MCAsmInfoDarwin.cpp @@ -85,3 +85,8 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { DwarfUsesRelocationsAcrossSections = false; SetDirectiveSuppressesReloc = true; } + +bool MCAsmInfoDarwin::useCodeAlign(const MCSection &Sec) const { + return static_cast<const MCSectionMachO &>(Sec).hasAttribute( + MachO::S_ATTR_PURE_INSTRUCTIONS); +} diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp index 7eb89ef..cdae9d7 100644 --- a/llvm/lib/MC/MCAsmInfoELF.cpp +++ b/llvm/lib/MC/MCAsmInfoELF.cpp @@ -12,9 +12,16 @@ //===----------------------------------------------------------------------===// #include 
"llvm/MC/MCAsmInfoELF.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TargetParser/Triple.h" +#include <cassert> using namespace llvm; @@ -28,9 +35,198 @@ MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const { return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0); } +bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const { + return static_cast<const MCSectionELF &>(Sec).getFlags() & ELF::SHF_EXECINSTR; +} + MCAsmInfoELF::MCAsmInfoELF() { HasIdentDirective = true; WeakRefDirective = "\t.weak\t"; PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; } + +static void printName(raw_ostream &OS, StringRef Name) { + if (Name.find_first_not_of("0123456789_." + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { + OS << Name; + return; + } + OS << '"'; + for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { + if (*B == '"') // Unquoted " + OS << "\\\""; + else if (*B != '\\') // Neither " or backslash + OS << *B; + else if (B + 1 == E) // Trailing backslash + OS << "\\\\"; + else { + OS << B[0] << B[1]; // Quoted character + ++B; + } + } + OS << '"'; +} + +void MCAsmInfoELF::printSwitchToSection(const MCSection &Section, + uint32_t Subsection, const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionELF &>(Section); + if (!Sec.isUnique() && shouldOmitSectionDirective(Sec.getName())) { + OS << '\t' << Sec.getName(); + if (Subsection) + OS << '\t' << Subsection; + OS << '\n'; + return; + } + + OS << "\t.section\t"; + printName(OS, Sec.getName()); + + // Handle the weird solaris syntax if desired. + if (usesSunStyleELFSectionSwitchSyntax() && !(Sec.Flags & ELF::SHF_MERGE)) { + if (Sec.Flags & ELF::SHF_ALLOC) + OS << ",#alloc"; + if (Sec.Flags & ELF::SHF_EXECINSTR) + OS << ",#execinstr"; + if (Sec.Flags & ELF::SHF_WRITE) + OS << ",#write"; + if (Sec.Flags & ELF::SHF_EXCLUDE) + OS << ",#exclude"; + if (Sec.Flags & ELF::SHF_TLS) + OS << ",#tls"; + OS << '\n'; + return; + } + + OS << ",\""; + if (Sec.Flags & ELF::SHF_ALLOC) + OS << 'a'; + if (Sec.Flags & ELF::SHF_EXCLUDE) + OS << 'e'; + if (Sec.Flags & ELF::SHF_EXECINSTR) + OS << 'x'; + if (Sec.Flags & ELF::SHF_WRITE) + OS << 'w'; + if (Sec.Flags & ELF::SHF_MERGE) + OS << 'M'; + if (Sec.Flags & ELF::SHF_STRINGS) + OS << 'S'; + if (Sec.Flags & ELF::SHF_TLS) + OS << 'T'; + if (Sec.Flags & ELF::SHF_LINK_ORDER) + OS << 'o'; + if (Sec.Flags & ELF::SHF_GROUP) + OS << 'G'; + if (Sec.Flags & ELF::SHF_GNU_RETAIN) + OS << 'R'; + + // If there are os-specific flags, print them. + if (T.isOSSolaris()) + if (Sec.Flags & ELF::SHF_SUNW_NODISCARD) + OS << 'R'; + + // If there are target-specific flags, print them. 
+ Triple::ArchType Arch = T.getArch(); + if (Arch == Triple::xcore) { + if (Sec.Flags & ELF::XCORE_SHF_CP_SECTION) + OS << 'c'; + if (Sec.Flags & ELF::XCORE_SHF_DP_SECTION) + OS << 'd'; + } else if (T.isARM() || T.isThumb()) { + if (Sec.Flags & ELF::SHF_ARM_PURECODE) + OS << 'y'; + } else if (T.isAArch64()) { + if (Sec.Flags & ELF::SHF_AARCH64_PURECODE) + OS << 'y'; + } else if (Arch == Triple::hexagon) { + if (Sec.Flags & ELF::SHF_HEX_GPREL) + OS << 's'; + } else if (Arch == Triple::x86_64) { + if (Sec.Flags & ELF::SHF_X86_64_LARGE) + OS << 'l'; + } + + OS << '"'; + + OS << ','; + + // If comment string is '@', e.g. as on ARM - use '%' instead + if (getCommentString()[0] == '@') + OS << '%'; + else + OS << '@'; + + if (Sec.Type == ELF::SHT_INIT_ARRAY) + OS << "init_array"; + else if (Sec.Type == ELF::SHT_FINI_ARRAY) + OS << "fini_array"; + else if (Sec.Type == ELF::SHT_PREINIT_ARRAY) + OS << "preinit_array"; + else if (Sec.Type == ELF::SHT_NOBITS) + OS << "nobits"; + else if (Sec.Type == ELF::SHT_NOTE) + OS << "note"; + else if (Sec.Type == ELF::SHT_PROGBITS) + OS << "progbits"; + else if (Sec.Type == ELF::SHT_X86_64_UNWIND) + OS << "unwind"; + else if (Sec.Type == ELF::SHT_MIPS_DWARF) + // Print hex value of the flag while we do not have + // any standard symbolic representation of the flag. + OS << "0x7000001e"; + else if (Sec.Type == ELF::SHT_LLVM_ODRTAB) + OS << "llvm_odrtab"; + else if (Sec.Type == ELF::SHT_LLVM_LINKER_OPTIONS) + OS << "llvm_linker_options"; + else if (Sec.Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE) + OS << "llvm_call_graph_profile"; + else if (Sec.Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES) + OS << "llvm_dependent_libraries"; + else if (Sec.Type == ELF::SHT_LLVM_SYMPART) + OS << "llvm_sympart"; + else if (Sec.Type == ELF::SHT_LLVM_BB_ADDR_MAP) + OS << "llvm_bb_addr_map"; + else if (Sec.Type == ELF::SHT_LLVM_OFFLOADING) + OS << "llvm_offloading"; + else if (Sec.Type == ELF::SHT_LLVM_LTO) + OS << "llvm_lto"; + else if (Sec.Type == ELF::SHT_LLVM_JT_SIZES) + OS << "llvm_jt_sizes"; + else if (Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE) + OS << "llvm_cfi_jump_table"; + else + OS << "0x" << Twine::utohexstr(Sec.Type); + + if (Sec.EntrySize) { + assert((Sec.Flags & ELF::SHF_MERGE) || + Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE); + OS << "," << Sec.EntrySize; + } + + if (Sec.Flags & ELF::SHF_LINK_ORDER) { + OS << ","; + if (Sec.LinkedToSym) + printName(OS, Sec.LinkedToSym->getName()); + else + OS << '0'; + } + + if (Sec.Flags & ELF::SHF_GROUP) { + OS << ","; + printName(OS, Sec.Group.getPointer()->getName()); + if (Sec.isComdat()) + OS << ",comdat"; + } + + if (Sec.isUnique()) + OS << ",unique," << Sec.UniqueID; + + OS << '\n'; + + if (Subsection) { + OS << "\t.subsection\t" << Subsection; + OS << '\n'; + } +} diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp index 3c81a46..0a5d1927 100644 --- a/llvm/lib/MC/MCAsmInfoGOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp @@ -13,11 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoGOFF.h" +#include "llvm/BinaryFormat/GOFF.h" +#include "llvm/MC/MCSectionGOFF.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; -void MCAsmInfoGOFF::anchor() {} - MCAsmInfoGOFF::MCAsmInfoGOFF() { Data64bitsDirective = "\t.quad\t"; HasDotTypeDotSizeDirective = false; @@ -25,3 +26,136 @@ MCAsmInfoGOFF::MCAsmInfoGOFF() { PrivateLabelPrefix = "L#"; ZeroDirective = "\t.space\t"; } + +static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, + 
GOFF::ESDAlignment Alignment, + GOFF::ESDLoadingBehavior LoadBehavior, + GOFF::ESDExecutable Executable, bool IsReadOnly, + uint32_t SortKey, uint8_t FillByteValue, + StringRef PartName) { + OS << Name << " CATTR "; + OS << "ALIGN(" << static_cast<unsigned>(Alignment) << ")," + << "FILL(" << static_cast<unsigned>(FillByteValue) << ")"; + switch (LoadBehavior) { + case GOFF::ESD_LB_Deferred: + OS << ",DEFLOAD"; + break; + case GOFF::ESD_LB_NoLoad: + OS << ",NOLOAD"; + break; + default: + break; + } + switch (Executable) { + case GOFF::ESD_EXE_CODE: + OS << ",EXECUTABLE"; + break; + case GOFF::ESD_EXE_DATA: + OS << ",NOTEXECUTABLE"; + break; + default: + break; + } + if (IsReadOnly) + OS << ",READONLY"; + if (Rmode != GOFF::ESD_RMODE_None) { + OS << ','; + OS << "RMODE("; + switch (Rmode) { + case GOFF::ESD_RMODE_24: + OS << "24"; + break; + case GOFF::ESD_RMODE_31: + OS << "31"; + break; + case GOFF::ESD_RMODE_64: + OS << "64"; + break; + case GOFF::ESD_RMODE_None: + break; + } + OS << ')'; + } + if (SortKey) + OS << ",PRIORITY(" << SortKey << ")"; + if (!PartName.empty()) + OS << ",PART(" << PartName << ")"; + OS << '\n'; +} + +static void emitXATTR(raw_ostream &OS, StringRef Name, + GOFF::ESDLinkageType Linkage, + GOFF::ESDExecutable Executable, + GOFF::ESDBindingScope BindingScope) { + OS << Name << " XATTR "; + OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),"; + if (Executable != GOFF::ESD_EXE_Unspecified) + OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? "CODE" : "DATA") + << "),"; + if (BindingScope != GOFF::ESD_BSC_Unspecified) { + OS << "SCOPE("; + switch (BindingScope) { + case GOFF::ESD_BSC_Section: + OS << "SECTION"; + break; + case GOFF::ESD_BSC_Module: + OS << "MODULE"; + break; + case GOFF::ESD_BSC_Library: + OS << "LIBRARY"; + break; + case GOFF::ESD_BSC_ImportExport: + OS << "EXPORT"; + break; + default: + break; + } + OS << ')'; + } + OS << '\n'; +} + +void MCAsmInfoGOFF::printSwitchToSection(const MCSection &Section, + uint32_t Subsection, const Triple &T, + raw_ostream &OS) const { + auto &Sec = + const_cast<MCSectionGOFF &>(static_cast<const MCSectionGOFF &>(Section)); + switch (Sec.SymbolType) { + case GOFF::ESD_ST_SectionDefinition: { + OS << Sec.getName() << " CSECT\n"; + Sec.Emitted = true; + break; + } + case GOFF::ESD_ST_ElementDefinition: { + printSwitchToSection(*Sec.getParent(), Subsection, T, OS); + if (!Sec.Emitted) { + emitCATTR(OS, Sec.getName(), Sec.EDAttributes.Rmode, + Sec.EDAttributes.Alignment, Sec.EDAttributes.LoadBehavior, + GOFF::ESD_EXE_Unspecified, Sec.EDAttributes.IsReadOnly, 0, + Sec.EDAttributes.FillByteValue, StringRef()); + Sec.Emitted = true; + } else + OS << Sec.getName() << " CATTR\n"; + break; + } + case GOFF::ESD_ST_PartReference: { + MCSectionGOFF *ED = Sec.getParent(); + printSwitchToSection(*ED->getParent(), Subsection, T, OS); + if (!Sec.Emitted) { + emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode, + ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior, + Sec.PRAttributes.Executable, ED->EDAttributes.IsReadOnly, + Sec.PRAttributes.SortKey, ED->EDAttributes.FillByteValue, + Sec.getName()); + emitXATTR(OS, Sec.getName(), Sec.PRAttributes.Linkage, + Sec.PRAttributes.Executable, Sec.PRAttributes.BindingScope); + ED->Emitted = true; + Sec.Emitted = true; + } else + OS << ED->getName() << " CATTR PART(" << Sec.getName() << ")\n"; + break; + } + default: + llvm_unreachable("Wrong section type"); + } +} diff --git a/llvm/lib/MC/MCAsmInfoWasm.cpp b/llvm/lib/MC/MCAsmInfoWasm.cpp index 
ce6ec7e..5e44f48 100644 --- a/llvm/lib/MC/MCAsmInfoWasm.cpp +++ b/llvm/lib/MC/MCAsmInfoWasm.cpp @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoWasm.h" -using namespace llvm; +#include "llvm/MC/MCSectionWasm.h" +#include "llvm/MC/MCSymbolWasm.h" +#include "llvm/Support/raw_ostream.h" -void MCAsmInfoWasm::anchor() {} +using namespace llvm; MCAsmInfoWasm::MCAsmInfoWasm() { HasIdentDirective = true; @@ -23,3 +25,80 @@ MCAsmInfoWasm::MCAsmInfoWasm() { PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; } + +static void printName(raw_ostream &OS, StringRef Name) { + if (Name.find_first_not_of("0123456789_." + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { + OS << Name; + return; + } + OS << '"'; + for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { + if (*B == '"') // Unquoted " + OS << "\\\""; + else if (*B != '\\') // Neither " or backslash + OS << *B; + else if (B + 1 == E) // Trailing backslash + OS << "\\\\"; + else { + OS << B[0] << B[1]; // Quoted character + ++B; + } + } + OS << '"'; +} + +void MCAsmInfoWasm::printSwitchToSection(const MCSection &Section, + uint32_t Subsection, const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionWasm &>(Section); + if (shouldOmitSectionDirective(Sec.getName())) { + OS << '\t' << Sec.getName(); + if (Subsection) + OS << '\t' << Subsection; + OS << '\n'; + return; + } + + OS << "\t.section\t"; + printName(OS, Sec.getName()); + OS << ",\""; + + if (Sec.IsPassive) + OS << 'p'; + if (Sec.Group) + OS << 'G'; + if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS) + OS << 'S'; + if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_TLS) + OS << 'T'; + if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN) + OS << 'R'; + + OS << '"'; + + OS << ','; + + // If comment string is '@', e.g. as on ARM - use '%' instead + if (getCommentString()[0] == '@') + OS << '%'; + else + OS << '@'; + + // TODO: Print section type. + + if (Sec.Group) { + OS << ","; + printName(OS, Sec.Group->getName()); + OS << ",comdat"; + } + + if (Sec.isUnique()) + OS << ",unique," << Sec.UniqueID; + + OS << '\n'; + + if (Subsection) + OS << "\t.subsection\t" << Subsection << '\n'; +} diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp index 6ef11ba..0403b44 100644 --- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp +++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp @@ -8,7 +8,11 @@ #include "llvm/MC/MCAsmInfoXCOFF.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -16,8 +20,6 @@ namespace llvm { extern cl::opt<cl::boolOrDefault> UseLEB128Directives; } -void MCAsmInfoXCOFF::anchor() {} - MCAsmInfoXCOFF::MCAsmInfoXCOFF() { IsAIX = true; IsLittleEndian = false; @@ -56,3 +58,121 @@ bool MCAsmInfoXCOFF::isAcceptableChar(char C) const { // any combination of these. 
return isAlnum(C) || C == '_' || C == '.'; } + +bool MCAsmInfoXCOFF::useCodeAlign(const MCSection &Sec) const { + return static_cast<const MCSectionXCOFF &>(Sec).getKind().isText(); +} + +MCSectionXCOFF::~MCSectionXCOFF() = default; + +void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const { + OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n'; +} + +void MCAsmInfoXCOFF::printSwitchToSection(const MCSection &Section, uint32_t, + const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionXCOFF &>(Section); + if (Sec.getKind().isText()) { + if (Sec.getMappingClass() != XCOFF::XMC_PR) + report_fatal_error("Unhandled storage-mapping class for .text csect"); + + Sec.printCsectDirective(OS); + return; + } + + if (Sec.getKind().isReadOnly()) { + if (Sec.getMappingClass() != XCOFF::XMC_RO && + Sec.getMappingClass() != XCOFF::XMC_TD) + report_fatal_error("Unhandled storage-mapping class for .rodata csect."); + Sec.printCsectDirective(OS); + return; + } + + if (Sec.getKind().isReadOnlyWithRel()) { + if (Sec.getMappingClass() != XCOFF::XMC_RW && + Sec.getMappingClass() != XCOFF::XMC_RO && + Sec.getMappingClass() != XCOFF::XMC_TD) + report_fatal_error( + "Unexpected storage-mapping class for ReadOnlyWithRel kind"); + Sec.printCsectDirective(OS); + return; + } + + // Initialized TLS data. + if (Sec.getKind().isThreadData()) { + // We only expect XMC_TL here for initialized TLS data. + if (Sec.getMappingClass() != XCOFF::XMC_TL) + report_fatal_error("Unhandled storage-mapping class for .tdata csect."); + Sec.printCsectDirective(OS); + return; + } + + if (Sec.getKind().isData()) { + switch (Sec.getMappingClass()) { + case XCOFF::XMC_RW: + case XCOFF::XMC_DS: + case XCOFF::XMC_TD: + Sec.printCsectDirective(OS); + break; + case XCOFF::XMC_TC: + case XCOFF::XMC_TE: + break; + case XCOFF::XMC_TC0: + OS << "\t.toc\n"; + break; + default: + report_fatal_error("Unhandled storage-mapping class for .data csect."); + } + return; + } + + if (Sec.isCsect() && Sec.getMappingClass() == XCOFF::XMC_TD) { + // Common csect type (uninitialized storage) does not have to print + // csect directive for section switching unless it is local. + if (Sec.getKind().isCommon() && !Sec.getKind().isBSSLocal()) + return; + + assert(Sec.getKind().isBSS() && "Unexpected section kind for toc-data"); + Sec.printCsectDirective(OS); + return; + } + // Common csect type (uninitialized storage) does not have to print csect + // directive for section switching. + if (Sec.isCsect() && Sec.getCSectType() == XCOFF::XTY_CM) { + assert((Sec.getMappingClass() == XCOFF::XMC_RW || + Sec.getMappingClass() == XCOFF::XMC_BS || + Sec.getMappingClass() == XCOFF::XMC_UL) && + "Generated a storage-mapping class for a common/bss/tbss csect we " + "don't " + "understand how to switch to."); + // Common symbols and local zero-initialized symbols for TLS and Non-TLS are + // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to + // cover TLS common and zero-initialized local symbols since linkage type + // (in the GlobalVariable) is not accessible in this class. + assert((Sec.getKind().isBSSLocal() || Sec.getKind().isCommon() || + Sec.getKind().isThreadBSS()) && + "wrong symbol type for .bss/.tbss csect"); + // Don't have to print a directive for switching to section for commons + // and zero-initialized TLS data. The '.comm' and '.lcomm' directives of the + // variable will create the needed csect. 
+ return; + } + + // Zero-initialized TLS data with weak or external linkage are not eligible to + // be put into common csect. + if (Sec.getKind().isThreadBSS()) { + Sec.printCsectDirective(OS); + return; + } + + // XCOFF debug sections. + if (Sec.getKind().isMetadata() && Sec.isDwarfSect()) { + OS << "\n\t.dwsect " << format("0x%" PRIx32, *Sec.getDwarfSubtypeFlags()) + << '\n'; + OS << Sec.getName() << ':' << '\n'; + return; + } + + report_fatal_error("Printing for this SectionKind is unimplemented."); +} diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 67c53e0..da51da4 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -345,7 +345,7 @@ public: void emitIdent(StringRef IdentString) override; void emitCFIBKeyFrame() override; void emitCFIMTETaggedFrame() override; - void emitCFISections(bool EH, bool Debug) override; + void emitCFISections(bool EH, bool Debug, bool SFrame) override; void emitCFIDefCfa(int64_t Register, int64_t Offset, SMLoc Loc) override; void emitCFIDefCfaOffset(int64_t Offset, SMLoc Loc) override; void emitCFIDefCfaRegister(int64_t Register, SMLoc Loc) override; @@ -532,8 +532,8 @@ void MCAsmStreamer::switchSection(MCSection *Section, uint32_t Subsection) { if (MCTargetStreamer *TS = getTargetStreamer()) { TS->changeSection(Cur.first, Section, Subsection, OS); } else { - Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, - Subsection); + MAI->printSwitchToSection(*Section, Subsection, + getContext().getTargetTriple(), OS); } } MCStreamer::switchSection(Section, Subsection); @@ -543,7 +543,7 @@ bool MCAsmStreamer::popSection() { if (!MCStreamer::popSection()) return false; auto [Sec, Subsec] = getCurrentSection(); - Sec->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, Subsec); + MAI->printSwitchToSection(*Sec, Subsec, getContext().getTargetTriple(), OS); return true; } @@ -1105,7 +1105,7 @@ void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol, // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; - assert(Section->getVariant() == MCSection::SV_MachO && + assert(getContext().getObjectFileType() == MCContext::IsMachO && ".zerofill is a Mach-O specific directive"); // This is a mach-o specific directive. @@ -1130,7 +1130,7 @@ void MCAsmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, // Instead of using the Section we'll just use the shortcut. - assert(Section->getVariant() == MCSection::SV_MachO && + assert(getContext().getObjectFileType() == MCContext::IsMachO && ".zerofill is a Mach-O specific directive"); // This is a mach-o specific directive and section. 
@@ -1906,15 +1906,24 @@ void MCAsmStreamer::emitIdent(StringRef IdentString) { EmitEOL(); } -void MCAsmStreamer::emitCFISections(bool EH, bool Debug) { - MCStreamer::emitCFISections(EH, Debug); +void MCAsmStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) { + MCStreamer::emitCFISections(EH, Debug, SFrame); OS << "\t.cfi_sections "; + bool C = false; if (EH) { OS << ".eh_frame"; - if (Debug) - OS << ", .debug_frame"; - } else if (Debug) { + C = true; + } + if (Debug) { + if (C) + OS << ", "; OS << ".debug_frame"; + C = true; + } + if (SFrame) { + if (C) + OS << ", "; + OS << ".sframe"; } EmitEOL(); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 2b56e2a..8500fd1 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -8,7 +8,6 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -107,7 +106,6 @@ void MCAssembler::reset() { bool MCAssembler::registerSection(MCSection &Section) { if (Section.isRegistered()) return false; - assert(Section.curFragList()->Head && "allocInitialFragment not called"); Sections.push_back(&Section); Section.setIsRegistered(true); return true; diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 12b3fba..39bf628 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -200,16 +200,6 @@ MCInst *MCContext::createMCInst() { return new (MCInstAllocator.Allocate()) MCInst; } -// Allocate the initial MCFragment for the begin symbol. -MCFragment *MCContext::allocInitialFragment(MCSection &Sec) { - assert(!Sec.curFragList()->Head); - auto *F = allocFragment<MCFragment>(); - F->setParent(&Sec); - Sec.curFragList()->Head = F; - Sec.curFragList()->Tail = F; - return F; -} - //===----------------------------------------------------------------------===// // Symbol Manipulation //===----------------------------------------------------------------------===// @@ -443,17 +433,19 @@ MCSymbol *MCContext::getDirectionalLocalSymbol(unsigned LocalLabelVal, return getOrCreateDirectionalLocalSymbol(LocalLabelVal, Instance); } +// Create a section symbol, with a distinct one for each section of the same. +// The first symbol is used for assembly code references. template <typename Symbol> Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) { Symbol *R; auto &SymEntry = getSymbolTableEntry(Section); MCSymbol *Sym = SymEntry.second.Symbol; - // A section symbol can not redefine regular symbols. There may be multiple - // sections with the same name, in which case the first such section wins. if (Sym && Sym->isDefined() && (!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym)) reportError(SMLoc(), "invalid symbol redefinition"); - if (Sym && Sym->isUndefined()) { + // Use the symbol's index to track if it has been used as a section symbol. + // Set to -1 to catch potential bugs if misused as a symbol index. + if (Sym && Sym->getIndex() != -1u) { R = cast<Symbol>(Sym); } else { SymEntry.second.Used = true; @@ -461,6 +453,8 @@ Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) { if (!Sym) SymEntry.second.Symbol = R; } + // Mark as section symbol. 
+ R->setIndex(-1u); return R; } @@ -568,7 +562,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section, MCSectionMachO(Segment, Name.substr(Name.size() - Section.size()), TypeAndAttributes, Reserved2, Kind, Begin); R.first->second = Ret; - allocInitialFragment(*Ret); return Ret; } @@ -579,15 +572,8 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type, bool Comdat, unsigned UniqueID, const MCSymbolELF *LinkedToSym) { auto *R = getOrCreateSectionSymbol<MCSymbolELF>(Section); - R->setBinding(ELF::STB_LOCAL); - R->setType(ELF::STT_SECTION); - - auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF( + return new (ELFAllocator.Allocate()) MCSectionELF( Section, Type, Flags, EntrySize, Group, Comdat, UniqueID, R, LinkedToSym); - - auto *F = allocInitialFragment(*Ret); - R->setFragment(F); - return Ret; } MCSectionELF * @@ -743,7 +729,6 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name, MCSectionGOFF(CachedName, Kind, IsVirtual, Attributes, static_cast<MCSectionGOFF *>(Parent)); Iter->second = GOFFSection; - allocInitialFragment(*GOFFSection); return GOFFSection; } @@ -782,8 +767,8 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE && COMDATSymbol->isDefined() && (!COMDATSymbol->isInSection() || - cast<MCSectionCOFF>(COMDATSymbol->getSection()).getCOMDATSymbol() != - COMDATSymbol)) + static_cast<const MCSectionCOFF &>(COMDATSymbol->getSection()) + .getCOMDATSymbol() != COMDATSymbol)) reportError(SMLoc(), "invalid symbol redefinition"); } @@ -798,8 +783,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, MCSectionCOFF *Result = new (COFFAllocator.Allocate()) MCSectionCOFF( CachedName, Characteristics, COMDATSymbol, Selection, UniqueID, Begin); Iter->second = Result; - auto *F = allocInitialFragment(*Result); - Begin->setFragment(F); + Begin->setFragment(&Result->getDummyFragment()); return Result; } @@ -870,8 +854,6 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind, MCSectionWasm(CachedName, Kind, Flags, GroupSym, UniqueID, Begin); Entry.second = Result; - auto *F = allocInitialFragment(*Result); - Begin->setFragment(F); return Result; } @@ -927,24 +909,11 @@ MCSectionXCOFF *MCContext::getXCOFFSection( MultiSymbolsAllowed); Entry.second = Result; - - auto *F = allocInitialFragment(*Result); - - // We might miss calculating the symbols difference as absolute value before - // adding fixups when symbol_A without the fragment set is the csect itself - // and symbol_B is in it. - // TODO: Currently we only set the fragment for XMC_PR csects and DWARF - // sections because we don't have other cases that hit this problem yet. 
- if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR) - QualName->setFragment(F); - return Result; } MCSectionSPIRV *MCContext::getSPIRVSection() { MCSectionSPIRV *Result = new (SPIRVAllocator.Allocate()) MCSectionSPIRV(); - - allocInitialFragment(*Result); return Result; } @@ -964,7 +933,6 @@ MCSectionDXContainer *MCContext::getDXContainerSection(StringRef Section, new (DXCAllocator.Allocate()) MCSectionDXContainer(Name, K, nullptr); // The first fragment will store the header - allocInitialFragment(*MapIt->second); return MapIt->second; } diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index b8cbaea5..38744a0 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -89,7 +89,9 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { getWriter().markGnuAbi(); MCObjectStreamer::changeSection(Section, Subsection); - Asm.registerSymbol(*Section->getBeginSymbol()); + auto *Sym = static_cast<MCSymbolELF *>(Section->getBeginSymbol()); + Sym->setBinding(ELF::STB_LOCAL); + Sym->setType(ELF::STT_SECTION); } void MCELFStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) { diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 3c395e5..6cbdf74 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -35,7 +35,7 @@ MCFragment::MCFragment(FragmentType Kind, bool HasInstructions) } const MCSymbol *MCFragment::getAtom() const { - return cast<MCSectionMachO>(Parent)->getAtom(LayoutOrder); + return static_cast<const MCSectionMachO *>(Parent)->getAtom(LayoutOrder); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/MC/MCGOFFStreamer.cpp b/llvm/lib/MC/MCGOFFStreamer.cpp index b702191..1718e2a 100644 --- a/llvm/lib/MC/MCGOFFStreamer.cpp +++ b/llvm/lib/MC/MCGOFFStreamer.cpp @@ -26,19 +26,15 @@ GOFFObjectWriter &MCGOFFStreamer::getWriter() { return static_cast<GOFFObjectWriter &>(getAssembler().getWriter()); } -// Make sure that all section are registered in the correct order. -static void registerSectionHierarchy(MCAssembler &Asm, MCSectionGOFF *Section) { - if (Section->isRegistered()) - return; - if (Section->getParent()) - registerSectionHierarchy(Asm, Section->getParent()); - Asm.registerSection(*Section); -} - void MCGOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - registerSectionHierarchy(getAssembler(), - static_cast<MCSectionGOFF *>(Section)); - MCObjectStreamer::changeSection(Section, Subsection); + // Make sure that all section are registered in the correct order. + SmallVector<MCSectionGOFF *> Sections; + for (auto *S = static_cast<MCSectionGOFF *>(Section); S; S = S->getParent()) + Sections.push_back(S); + while (!Sections.empty()) { + auto *S = Sections.pop_back_val(); + MCObjectStreamer::changeSection(S, Sections.empty() ? 
Subsection : 0); + } } MCStreamer *llvm::createGOFFStreamer(MCContext &Context, diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 8f023bd..1074669 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MachO.h" @@ -141,6 +140,8 @@ void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) { MCSymbol *Label = getContext().createLinkerPrivateTempSymbol(); Section->setBeginSymbol(Label); HasSectionLabel[Section] = true; + if (!Label->isInSection()) + emitLabel(Label); } } @@ -442,13 +443,13 @@ void MCMachOStreamer::finishImpl() { // Set the fragment atom associations by tracking the last seen atom defining // symbol. for (MCSection &Sec : getAssembler()) { - cast<MCSectionMachO>(Sec).allocAtoms(); + static_cast<MCSectionMachO &>(Sec).allocAtoms(); const MCSymbol *CurrentAtom = nullptr; size_t I = 0; for (MCFragment &Frag : Sec) { if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag)) CurrentAtom = Symbol; - cast<MCSectionMachO>(Sec).setAtom(I++, CurrentAtom); + static_cast<MCSectionMachO &>(Sec).setAtom(I++, CurrentAtom); } } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 42f4cf4..f046552 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -19,7 +19,6 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" using namespace llvm; @@ -130,10 +129,11 @@ void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) { Assembler->registerSymbol(Sym); } -void MCObjectStreamer::emitCFISections(bool EH, bool Debug) { - MCStreamer::emitCFISections(EH, Debug); +void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) { + MCStreamer::emitCFISections(EH, Debug, SFrame); EmitEHFrame = EH; EmitDebugFrame = Debug; + EmitSFrame = SFrame; } void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, @@ -185,10 +185,10 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { getAssembler().registerSymbol(*Symbol); - // If there is a current fragment, mark the symbol as pointing into it. - // Otherwise queue the label and set its fragment pointer when we emit the - // next fragment. - MCFragment *F = getCurrentFragment(); + // Set the fragment and offset. This function might be called by + // changeSection, when the section stack top hasn't been changed to the new + // section. + MCFragment *F = CurFrag; Symbol->setFragment(F); Symbol->setOffset(F->getContents().size()); @@ -247,6 +247,15 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { assert(Section && "Cannot switch to a null section!"); getContext().clearDwarfLocSeen(); + // Register the section and create an initial fragment for subsection 0 + // if `Subsection` is non-zero. 
+ bool NewSec = getAssembler().registerSection(*Section); + MCFragment *F0 = nullptr; + if (NewSec && Subsection) { + changeSection(Section, 0); + F0 = CurFrag; + } + auto &Subsections = Section->Subsections; size_t I = 0, E = Subsections.size(); while (I != E && Subsections[I].first < Subsection) @@ -262,12 +271,13 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) { Section->CurFragList = &Subsections[I].second; CurFrag = Section->CurFragList->Tail; - getAssembler().registerSection(*Section); -} - -void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) { - MCStreamer::switchSectionNoPrint(Section); - changeSection(Section, 0); + // Define the section symbol at subsection 0's initial fragment if required. + if (!NewSec) + return; + if (auto *Sym = Section->getBeginSymbol()) { + Sym->setFragment(Subsection ? F0 : CurFrag); + getAssembler().registerSymbol(*Sym); + } } void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index d0b6ea4..9f64a98 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3413,7 +3413,7 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) { // Check whether we should use optimal code alignment for this .align // directive. - if (Section->useCodeAlign() && !HasFillExpr) { + if (MAI.useCodeAlign(*Section) && !HasFillExpr) { getStreamer().emitCodeAlignment( Align(Alignment), &getTargetParser().getSTI(), MaxBytesToFill); } else { @@ -4093,27 +4093,30 @@ bool AsmParser::parseDirectiveCVFPOData() { } /// parseDirectiveCFISections -/// ::= .cfi_sections section [, section] +/// ::= .cfi_sections section [, section][, section] bool AsmParser::parseDirectiveCFISections() { StringRef Name; bool EH = false; bool Debug = false; + bool SFrame = false; if (!parseOptionalToken(AsmToken::EndOfStatement)) { for (;;) { if (parseIdentifier(Name)) - return TokError("expected .eh_frame or .debug_frame"); + return TokError("expected .eh_frame, .debug_frame, or .sframe"); if (Name == ".eh_frame") EH = true; else if (Name == ".debug_frame") Debug = true; + else if (Name == ".sframe") + SFrame = true; if (parseOptionalToken(AsmToken::EndOfStatement)) break; if (parseComma()) return true; } } - getStreamer().emitCFISections(EH, Debug); + getStreamer().emitCFISections(EH, Debug, SFrame); return false; } diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index c7c3df3..2e251cc 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -644,8 +644,8 @@ EndStmt: } if (UseLastGroup) { - if (const MCSectionELF *Section = - cast_or_null<MCSectionELF>(getStreamer().getCurrentSectionOnly())) + if (auto *Section = static_cast<const MCSectionELF *>( + getStreamer().getCurrentSectionOnly())) if (const MCSymbol *Group = Section->getGroup()) { GroupName = Group->getName(); IsComdat = Section->isComdat(); diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index f4684e6..780289e 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -4228,8 +4228,7 @@ bool MasmParser::emitAlignTo(int64_t Alignment) { // Check whether we should use optimal code alignment for this align // directive. 
const MCSection *Section = getStreamer().getCurrentSectionOnly(); - assert(Section && "must have section to emit alignment"); - if (Section->useCodeAlign()) { + if (MAI.useCodeAlign(*Section)) { getStreamer().emitCodeAlignment(Align(Alignment), &getTargetParser().getSTI(), /*MaxBytesToEmit=*/0); diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 1f824b8..d97f4f5 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -252,7 +252,7 @@ public: if (TypeName == "function") { WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); auto *Current = - cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly()); + static_cast<MCSectionWasm *>(getStreamer().getCurrentSectionOnly()); if (Current->getGroup()) WasmSym->setComdat(true); } else if (TypeName == "global") diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 023f7f2..4f28267 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -18,12 +18,10 @@ using namespace llvm; -MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss, - MCSymbol *Begin) +MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin) : Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText), - IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) { - // The initial subsection number is 0. Create a fragment list. - CurFragList = &Subsections.emplace_back(0u, FragList{}).second; + IsBss(IsBss), LinkerRelaxable(false), Name(Name) { + DummyFragment.setParent(this); } MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) { diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp deleted file mode 100644 index 5bf1473..0000000 --- a/llvm/lib/MC/MCSectionCOFF.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//===- lib/MC/MCSectionCOFF.cpp - COFF Code Section Representation --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionCOFF.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/raw_ostream.h" -#include <cassert> - -using namespace llvm; - -// shouldOmitSectionDirective - Decides whether a '.section' directive -// should be printed before the section name -bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name, - const MCAsmInfo &MAI) const { - if (COMDATSymbol || isUnique()) - return false; - - // FIXME: Does .section .bss/.data/.text work everywhere?? 
- if (Name == ".text" || Name == ".data" || Name == ".bss") - return true; - - return false; -} - -void MCSectionCOFF::setSelection(int Selection) const { - assert(Selection != 0 && "invalid COMDAT selection type"); - this->Selection = Selection; - Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; -} - -void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - // standard sections don't require the '.section' - if (shouldOmitSectionDirective(getName(), MAI)) { - OS << '\t' << getName() << '\n'; - return; - } - - OS << "\t.section\t" << getName() << ",\""; - if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) - OS << 'd'; - if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) - OS << 'b'; - if (getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE) - OS << 'x'; - if (getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE) - OS << 'w'; - else if (getCharacteristics() & COFF::IMAGE_SCN_MEM_READ) - OS << 'r'; - else - OS << 'y'; - if (getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE) - OS << 'n'; - if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED) - OS << 's'; - if ((getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) && - !isImplicitlyDiscardable(getName())) - OS << 'D'; - if (getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO) - OS << 'i'; - OS << '"'; - - // unique should be tail of .section directive. - if (isUnique() && !COMDATSymbol) - OS << ",unique," << UniqueID; - - if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { - if (COMDATSymbol) - OS << ","; - else - OS << "\n\t.linkonce\t"; - switch (Selection) { - case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES: - OS << "one_only"; - break; - case COFF::IMAGE_COMDAT_SELECT_ANY: - OS << "discard"; - break; - case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE: - OS << "same_size"; - break; - case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH: - OS << "same_contents"; - break; - case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE: - OS << "associative"; - break; - case COFF::IMAGE_COMDAT_SELECT_LARGEST: - OS << "largest"; - break; - case COFF::IMAGE_COMDAT_SELECT_NEWEST: - OS << "newest"; - break; - default: - assert(false && "unsupported COFF selection type"); - break; - } - if (COMDATSymbol) { - OS << ","; - COMDATSymbol->print(OS, &MAI); - } - } - - if (isUnique() && COMDATSymbol) - OS << ",unique," << UniqueID; - - OS << '\n'; -} - -bool MCSectionCOFF::useCodeAlign() const { return isText(); } diff --git a/llvm/lib/MC/MCSectionDXContainer.cpp b/llvm/lib/MC/MCSectionDXContainer.cpp deleted file mode 100644 index 7eee59d..0000000 --- a/llvm/lib/MC/MCSectionDXContainer.cpp +++ /dev/null @@ -1,15 +0,0 @@ -//===- lib/MC/MCSectionDXContainer.cpp - DXContainer Section --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionDXContainer.h" - -using namespace llvm; - -void MCSectionDXContainer::printSwitchToSection(const MCAsmInfo &, - const Triple &, raw_ostream &, - uint32_t) const {} diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp deleted file mode 100644 index ef33f9c..0000000 --- a/llvm/lib/MC/MCSectionELF.cpp +++ /dev/null @@ -1,217 +0,0 @@ -//===- lib/MC/MCSectionELF.cpp - ELF Code Section Representation ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionELF.h" -#include "llvm/ADT/Twine.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/TargetParser/Triple.h" -#include <cassert> - -using namespace llvm; - -// Decides whether a '.section' directive -// should be printed before the section name. -bool MCSectionELF::shouldOmitSectionDirective(StringRef Name, - const MCAsmInfo &MAI) const { - if (isUnique()) - return false; - - return MAI.shouldOmitSectionDirective(Name); -} - -static void printName(raw_ostream &OS, StringRef Name) { - if (Name.find_first_not_of("0123456789_." - "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { - OS << Name; - return; - } - OS << '"'; - for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { - if (*B == '"') // Unquoted " - OS << "\\\""; - else if (*B != '\\') // Neither " or backslash - OS << *B; - else if (B + 1 == E) // Trailing backslash - OS << "\\\\"; - else { - OS << B[0] << B[1]; // Quoted character - ++B; - } - } - OS << '"'; -} - -void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - if (shouldOmitSectionDirective(getName(), MAI)) { - OS << '\t' << getName(); - if (Subsection) - OS << '\t' << Subsection; - OS << '\n'; - return; - } - - OS << "\t.section\t"; - printName(OS, getName()); - - // Handle the weird solaris syntax if desired. - if (MAI.usesSunStyleELFSectionSwitchSyntax() && - !(Flags & ELF::SHF_MERGE)) { - if (Flags & ELF::SHF_ALLOC) - OS << ",#alloc"; - if (Flags & ELF::SHF_EXECINSTR) - OS << ",#execinstr"; - if (Flags & ELF::SHF_WRITE) - OS << ",#write"; - if (Flags & ELF::SHF_EXCLUDE) - OS << ",#exclude"; - if (Flags & ELF::SHF_TLS) - OS << ",#tls"; - OS << '\n'; - return; - } - - OS << ",\""; - if (Flags & ELF::SHF_ALLOC) - OS << 'a'; - if (Flags & ELF::SHF_EXCLUDE) - OS << 'e'; - if (Flags & ELF::SHF_EXECINSTR) - OS << 'x'; - if (Flags & ELF::SHF_WRITE) - OS << 'w'; - if (Flags & ELF::SHF_MERGE) - OS << 'M'; - if (Flags & ELF::SHF_STRINGS) - OS << 'S'; - if (Flags & ELF::SHF_TLS) - OS << 'T'; - if (Flags & ELF::SHF_LINK_ORDER) - OS << 'o'; - if (Flags & ELF::SHF_GROUP) - OS << 'G'; - if (Flags & ELF::SHF_GNU_RETAIN) - OS << 'R'; - - // If there are os-specific flags, print them. - if (T.isOSSolaris()) - if (Flags & ELF::SHF_SUNW_NODISCARD) - OS << 'R'; - - // If there are target-specific flags, print them. 
- Triple::ArchType Arch = T.getArch(); - if (Arch == Triple::xcore) { - if (Flags & ELF::XCORE_SHF_CP_SECTION) - OS << 'c'; - if (Flags & ELF::XCORE_SHF_DP_SECTION) - OS << 'd'; - } else if (T.isARM() || T.isThumb()) { - if (Flags & ELF::SHF_ARM_PURECODE) - OS << 'y'; - } else if (T.isAArch64()) { - if (Flags & ELF::SHF_AARCH64_PURECODE) - OS << 'y'; - } else if (Arch == Triple::hexagon) { - if (Flags & ELF::SHF_HEX_GPREL) - OS << 's'; - } else if (Arch == Triple::x86_64) { - if (Flags & ELF::SHF_X86_64_LARGE) - OS << 'l'; - } - - OS << '"'; - - OS << ','; - - // If comment string is '@', e.g. as on ARM - use '%' instead - if (MAI.getCommentString()[0] == '@') - OS << '%'; - else - OS << '@'; - - if (Type == ELF::SHT_INIT_ARRAY) - OS << "init_array"; - else if (Type == ELF::SHT_FINI_ARRAY) - OS << "fini_array"; - else if (Type == ELF::SHT_PREINIT_ARRAY) - OS << "preinit_array"; - else if (Type == ELF::SHT_NOBITS) - OS << "nobits"; - else if (Type == ELF::SHT_NOTE) - OS << "note"; - else if (Type == ELF::SHT_PROGBITS) - OS << "progbits"; - else if (Type == ELF::SHT_X86_64_UNWIND) - OS << "unwind"; - else if (Type == ELF::SHT_MIPS_DWARF) - // Print hex value of the flag while we do not have - // any standard symbolic representation of the flag. - OS << "0x7000001e"; - else if (Type == ELF::SHT_LLVM_ODRTAB) - OS << "llvm_odrtab"; - else if (Type == ELF::SHT_LLVM_LINKER_OPTIONS) - OS << "llvm_linker_options"; - else if (Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE) - OS << "llvm_call_graph_profile"; - else if (Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES) - OS << "llvm_dependent_libraries"; - else if (Type == ELF::SHT_LLVM_SYMPART) - OS << "llvm_sympart"; - else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP) - OS << "llvm_bb_addr_map"; - else if (Type == ELF::SHT_LLVM_OFFLOADING) - OS << "llvm_offloading"; - else if (Type == ELF::SHT_LLVM_LTO) - OS << "llvm_lto"; - else if (Type == ELF::SHT_LLVM_JT_SIZES) - OS << "llvm_jt_sizes"; - else if (Type == ELF::SHT_LLVM_CFI_JUMP_TABLE) - OS << "llvm_cfi_jump_table"; - else - OS << "0x" << Twine::utohexstr(Type); - - if (EntrySize) { - assert((Flags & ELF::SHF_MERGE) || Type == ELF::SHT_LLVM_CFI_JUMP_TABLE); - OS << "," << EntrySize; - } - - if (Flags & ELF::SHF_LINK_ORDER) { - OS << ","; - if (LinkedToSym) - printName(OS, LinkedToSym->getName()); - else - OS << '0'; - } - - if (Flags & ELF::SHF_GROUP) { - OS << ","; - printName(OS, Group.getPointer()->getName()); - if (isComdat()) - OS << ",comdat"; - } - - if (isUnique()) - OS << ",unique," << UniqueID; - - OS << '\n'; - - if (Subsection) { - OS << "\t.subsection\t" << Subsection; - OS << '\n'; - } -} - -bool MCSectionELF::useCodeAlign() const { - return getFlags() & ELF::SHF_EXECINSTR; -} diff --git a/llvm/lib/MC/MCSectionGOFF.cpp b/llvm/lib/MC/MCSectionGOFF.cpp deleted file mode 100644 index 8163e5b..0000000 --- a/llvm/lib/MC/MCSectionGOFF.cpp +++ /dev/null @@ -1,143 +0,0 @@ -//===- MCSectionGOFF.cpp - GOFF Code Section Representation ---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionGOFF.h" -#include "llvm/BinaryFormat/GOFF.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode, - GOFF::ESDAlignment Alignment, - GOFF::ESDLoadingBehavior LoadBehavior, - GOFF::ESDExecutable Executable, bool IsReadOnly, - uint32_t SortKey, uint8_t FillByteValue, - StringRef PartName) { - OS << Name << " CATTR "; - OS << "ALIGN(" << static_cast<unsigned>(Alignment) << ")," - << "FILL(" << static_cast<unsigned>(FillByteValue) << ")"; - switch (LoadBehavior) { - case GOFF::ESD_LB_Deferred: - OS << ",DEFLOAD"; - break; - case GOFF::ESD_LB_NoLoad: - OS << ",NOLOAD"; - break; - default: - break; - } - switch (Executable) { - case GOFF::ESD_EXE_CODE: - OS << ",EXECUTABLE"; - break; - case GOFF::ESD_EXE_DATA: - OS << ",NOTEXECUTABLE"; - break; - default: - break; - } - if (IsReadOnly) - OS << ",READONLY"; - if (Rmode != GOFF::ESD_RMODE_None) { - OS << ','; - OS << "RMODE("; - switch (Rmode) { - case GOFF::ESD_RMODE_24: - OS << "24"; - break; - case GOFF::ESD_RMODE_31: - OS << "31"; - break; - case GOFF::ESD_RMODE_64: - OS << "64"; - break; - case GOFF::ESD_RMODE_None: - break; - } - OS << ')'; - } - if (SortKey) - OS << ",PRIORITY(" << SortKey << ")"; - if (!PartName.empty()) - OS << ",PART(" << PartName << ")"; - OS << '\n'; -} - -static void emitXATTR(raw_ostream &OS, StringRef Name, - GOFF::ESDLinkageType Linkage, - GOFF::ESDExecutable Executable, - GOFF::ESDBindingScope BindingScope) { - OS << Name << " XATTR "; - OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),"; - if (Executable != GOFF::ESD_EXE_Unspecified) - OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? 
"CODE" : "DATA") - << "),"; - if (BindingScope != GOFF::ESD_BSC_Unspecified) { - OS << "SCOPE("; - switch (BindingScope) { - case GOFF::ESD_BSC_Section: - OS << "SECTION"; - break; - case GOFF::ESD_BSC_Module: - OS << "MODULE"; - break; - case GOFF::ESD_BSC_Library: - OS << "LIBRARY"; - break; - case GOFF::ESD_BSC_ImportExport: - OS << "EXPORT"; - break; - default: - break; - } - OS << ')'; - } - OS << '\n'; -} - -void MCSectionGOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - switch (SymbolType) { - case GOFF::ESD_ST_SectionDefinition: { - OS << Name << " CSECT\n"; - Emitted = true; - break; - } - case GOFF::ESD_ST_ElementDefinition: { - getParent()->printSwitchToSection(MAI, T, OS, Subsection); - if (!Emitted) { - emitCATTR(OS, Name, EDAttributes.Rmode, EDAttributes.Alignment, - EDAttributes.LoadBehavior, GOFF::ESD_EXE_Unspecified, - EDAttributes.IsReadOnly, 0, EDAttributes.FillByteValue, - StringRef()); - Emitted = true; - } else - OS << Name << " CATTR\n"; - break; - } - case GOFF::ESD_ST_PartReference: { - MCSectionGOFF *ED = getParent(); - ED->getParent()->printSwitchToSection(MAI, T, OS, Subsection); - if (!Emitted) { - emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode, - ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior, - PRAttributes.Executable, ED->EDAttributes.IsReadOnly, - PRAttributes.SortKey, ED->EDAttributes.FillByteValue, Name); - emitXATTR(OS, Name, PRAttributes.Linkage, PRAttributes.Executable, - PRAttributes.BindingScope); - ED->Emitted = true; - Emitted = true; - } else - OS << ED->getName() << " CATTR PART(" << Name << ")\n"; - break; - } - default: - llvm_unreachable("Wrong section type"); - } -}
\ No newline at end of file diff --git a/llvm/lib/MC/MCSectionMachO.cpp b/llvm/lib/MC/MCSectionMachO.cpp index 67453ce..67c8235 100644 --- a/llvm/lib/MC/MCSectionMachO.cpp +++ b/llvm/lib/MC/MCSectionMachO.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/raw_ostream.h" @@ -92,7 +93,7 @@ ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC) MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section, unsigned TAA, unsigned reserved2, SectionKind K, MCSymbol *Begin) - : MCSection(SV_MachO, Section, K.isText(), + : MCSection(Section, K.isText(), MachO::isVirtualSection(TAA & MachO::SECTION_TYPE), Begin), TypeAndAttributes(TAA), Reserved2(reserved2) { assert(Segment.size() <= 16 && Section.size() <= 16 && @@ -105,19 +106,20 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section, } } -void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - OS << "\t.section\t" << getSegmentName() << ',' << getName(); +void MCAsmInfoDarwin::printSwitchToSection(const MCSection &Section, uint32_t, + const Triple &T, + raw_ostream &OS) const { + auto &Sec = static_cast<const MCSectionMachO &>(Section); + OS << "\t.section\t" << Sec.getSegmentName() << ',' << Sec.getName(); // Get the section type and attributes. - unsigned TAA = getTypeAndAttributes(); + unsigned TAA = Sec.getTypeAndAttributes(); if (TAA == 0) { OS << '\n'; return; } - MachO::SectionType SectionType = getType(); + MachO::SectionType SectionType = Sec.getType(); assert(SectionType <= MachO::LAST_KNOWN_SECTION_TYPE && "Invalid SectionType specified!"); @@ -135,8 +137,8 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, if (SectionAttrs == 0) { // If we have a S_SYMBOL_STUBS size specified, print it along with 'none' as // the attribute specifier. - if (Reserved2 != 0) - OS << ",none," << Reserved2; + if (Sec.Reserved2 != 0) + OS << ",none," << Sec.Reserved2; OS << '\n'; return; } @@ -164,15 +166,11 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, assert(SectionAttrs == 0 && "Unknown section attributes!"); // If we have a S_SYMBOL_STUBS size specified, print it. - if (Reserved2 != 0) - OS << ',' << Reserved2; + if (Sec.Reserved2 != 0) + OS << ',' << Sec.Reserved2; OS << '\n'; } -bool MCSectionMachO::useCodeAlign() const { - return hasAttribute(MachO::S_ATTR_PURE_INSTRUCTIONS); -} - /// ParseSectionSpecifier - Parse the section specifier indicated by "Spec". /// This is a string that can appear after a .section directive in a mach-o /// flavored .s file. If successful, this fills in the specified Out diff --git a/llvm/lib/MC/MCSectionWasm.cpp b/llvm/lib/MC/MCSectionWasm.cpp deleted file mode 100644 index e25af1c..0000000 --- a/llvm/lib/MC/MCSectionWasm.cpp +++ /dev/null @@ -1,101 +0,0 @@ -//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionWasm.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSymbolWasm.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -// Decides whether a '.section' directive -// should be printed before the section name. -bool MCSectionWasm::shouldOmitSectionDirective(StringRef Name, - const MCAsmInfo &MAI) const { - return MAI.shouldOmitSectionDirective(Name); -} - -static void printName(raw_ostream &OS, StringRef Name) { - if (Name.find_first_not_of("0123456789_." - "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) { - OS << Name; - return; - } - OS << '"'; - for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) { - if (*B == '"') // Unquoted " - OS << "\\\""; - else if (*B != '\\') // Neither " or backslash - OS << *B; - else if (B + 1 == E) // Trailing backslash - OS << "\\\\"; - else { - OS << B[0] << B[1]; // Quoted character - ++B; - } - } - OS << '"'; -} - -void MCSectionWasm::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - - if (shouldOmitSectionDirective(getName(), MAI)) { - OS << '\t' << getName(); - if (Subsection) - OS << '\t' << Subsection; - OS << '\n'; - return; - } - - OS << "\t.section\t"; - printName(OS, getName()); - OS << ",\""; - - if (IsPassive) - OS << 'p'; - if (Group) - OS << 'G'; - if (SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS) - OS << 'S'; - if (SegmentFlags & wasm::WASM_SEG_FLAG_TLS) - OS << 'T'; - if (SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN) - OS << 'R'; - - OS << '"'; - - OS << ','; - - // If comment string is '@', e.g. as on ARM - use '%' instead - if (MAI.getCommentString()[0] == '@') - OS << '%'; - else - OS << '@'; - - // TODO: Print section type. - - if (Group) { - OS << ","; - printName(OS, Group->getName()); - OS << ",comdat"; - } - - if (isUnique()) - OS << ",unique," << UniqueID; - - OS << '\n'; - - if (Subsection) - OS << "\t.subsection\t" << Subsection << '\n'; -} - -bool MCSectionWasm::useCodeAlign() const { return false; } diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp deleted file mode 100644 index 41043b2..0000000 --- a/llvm/lib/MC/MCSectionXCOFF.cpp +++ /dev/null @@ -1,134 +0,0 @@ -//===- lib/MC/MCSectionXCOFF.cpp - XCOFF Code Section Representation ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCSectionXCOFF.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -namespace llvm { -class MCExpr; -class Triple; -} // namespace llvm - -using namespace llvm; - -MCSectionXCOFF::~MCSectionXCOFF() = default; - -void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const { - OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n'; -} - -void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, - raw_ostream &OS, - uint32_t Subsection) const { - if (getKind().isText()) { - if (getMappingClass() != XCOFF::XMC_PR) - report_fatal_error("Unhandled storage-mapping class for .text csect"); - - printCsectDirective(OS); - return; - } - - if (getKind().isReadOnly()) { - if (getMappingClass() != XCOFF::XMC_RO && - getMappingClass() != XCOFF::XMC_TD) - report_fatal_error("Unhandled storage-mapping class for .rodata csect."); - printCsectDirective(OS); - return; - } - - if (getKind().isReadOnlyWithRel()) { - if (getMappingClass() != XCOFF::XMC_RW && - getMappingClass() != XCOFF::XMC_RO && - getMappingClass() != XCOFF::XMC_TD) - report_fatal_error( - "Unexepected storage-mapping class for ReadOnlyWithRel kind"); - printCsectDirective(OS); - return; - } - - // Initialized TLS data. - if (getKind().isThreadData()) { - // We only expect XMC_TL here for initialized TLS data. - if (getMappingClass() != XCOFF::XMC_TL) - report_fatal_error("Unhandled storage-mapping class for .tdata csect."); - printCsectDirective(OS); - return; - } - - if (getKind().isData()) { - switch (getMappingClass()) { - case XCOFF::XMC_RW: - case XCOFF::XMC_DS: - case XCOFF::XMC_TD: - printCsectDirective(OS); - break; - case XCOFF::XMC_TC: - case XCOFF::XMC_TE: - break; - case XCOFF::XMC_TC0: - OS << "\t.toc\n"; - break; - default: - report_fatal_error( - "Unhandled storage-mapping class for .data csect."); - } - return; - } - - if (isCsect() && getMappingClass() == XCOFF::XMC_TD) { - // Common csect type (uninitialized storage) does not have to print csect - // directive for section switching unless it is local. - if (getKind().isCommon() && !getKind().isBSSLocal()) - return; - - assert(getKind().isBSS() && "Unexpected section kind for toc-data"); - printCsectDirective(OS); - return; - } - // Common csect type (uninitialized storage) does not have to print csect - // directive for section switching. - if (isCsect() && getCSectType() == XCOFF::XTY_CM) { - assert((getMappingClass() == XCOFF::XMC_RW || - getMappingClass() == XCOFF::XMC_BS || - getMappingClass() == XCOFF::XMC_UL) && - "Generated a storage-mapping class for a common/bss/tbss csect we " - "don't " - "understand how to switch to."); - // Common symbols and local zero-initialized symbols for TLS and Non-TLS are - // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to cover - // TLS common and zero-initialized local symbols since linkage type (in the - // GlobalVariable) is not accessible in this class. - assert((getKind().isBSSLocal() || getKind().isCommon() || - getKind().isThreadBSS()) && - "wrong symbol type for .bss/.tbss csect"); - // Don't have to print a directive for switching to section for commons and - // zero-initialized TLS data. The '.comm' and '.lcomm' directives of the - // variable will create the needed csect. 
- return; - } - - // Zero-initialized TLS data with weak or external linkage are not eligible to - // be put into common csect. - if (getKind().isThreadBSS()) { - printCsectDirective(OS); - return; - } - - // XCOFF debug sections. - if (getKind().isMetadata() && isDwarfSect()) { - OS << "\n\t.dwsect " << format("0x%" PRIx32, *getDwarfSubtypeFlags()) - << '\n'; - OS << getName() << ':' << '\n'; - return; - } - - report_fatal_error("Printing for this SectionKind is unimplemented."); -} - -bool MCSectionXCOFF::useCodeAlign() const { return getKind().isText(); } diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 30198c9..bc73981 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -56,12 +56,11 @@ void MCTargetStreamer::finish() {} void MCTargetStreamer::emitConstantPools() {} -void MCTargetStreamer::changeSection(const MCSection *CurSection, - MCSection *Section, uint32_t Subsection, - raw_ostream &OS) { - Section->printSwitchToSection(*Streamer.getContext().getAsmInfo(), - Streamer.getContext().getTargetTriple(), OS, - Subsection); +void MCTargetStreamer::changeSection(const MCSection *, MCSection *Sec, + uint32_t Subsection, raw_ostream &OS) { + auto &MAI = *Streamer.getContext().getAsmInfo(); + MAI.printSwitchToSection(*Sec, Subsection, + Streamer.getContext().getTargetTriple(), OS); } void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) { @@ -415,7 +414,7 @@ void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { void MCStreamer::emitConditionalAssignment(MCSymbol *Symbol, const MCExpr *Value) {} -void MCStreamer::emitCFISections(bool EH, bool Debug) {} +void MCStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {} void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) { if (!FrameInfoStack.empty() && @@ -838,8 +837,8 @@ static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID, if (TextSec == Context.getObjectFileInfo()->getTextSection()) return MainCFISec; - const auto *TextSecCOFF = cast<MCSectionCOFF>(TextSec); - auto *MainCFISecCOFF = cast<MCSectionCOFF>(MainCFISec); + const auto *TextSecCOFF = static_cast<const MCSectionCOFF *>(TextSec); + auto *MainCFISecCOFF = static_cast<MCSectionCOFF *>(MainCFISec); unsigned UniqueID = TextSecCOFF->getOrAssignWinCFISectionID(NextWinCFIID); // If this section is COMDAT, this unwind section should be COMDAT associative @@ -1314,9 +1313,20 @@ void MCStreamer::emitZerofill(MCSection *, MCSymbol *, uint64_t, Align, SMLoc) { } void MCStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) {} -void MCStreamer::changeSection(MCSection *Section, uint32_t) { - CurFrag = &Section->getDummyFragment(); + +void MCStreamer::changeSection(MCSection *Sec, uint32_t) { + CurFrag = &Sec->getDummyFragment(); + auto *Sym = Sec->getBeginSymbol(); + if (!Sym || !Sym->isUndefined()) + return; + // In Mach-O, DWARF sections use Begin as a temporary label, requiring a label + // definition, unlike section symbols in other file formats. 
+ if (getContext().getObjectFileType() == MCContext::IsMachO) + emitLabel(Sym); + else + Sym->setFragment(CurFrag); } + void MCStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {} void MCStreamer::emitBytes(StringRef Data) {} void MCStreamer::emitBinaryData(StringRef Data) { emitBytes(Data); } @@ -1358,9 +1368,6 @@ void MCStreamer::switchSection(MCSection *Section, uint32_t Subsection) { changeSection(Section, Subsection); SectionStack.back().first = MCSectionSubPair(Section, Subsection); assert(!Section->hasEnded() && "Section already ended"); - MCSymbol *Sym = Section->getBeginSymbol(); - if (Sym && !Sym->isInSection()) - emitLabel(Sym); } } @@ -1387,9 +1394,6 @@ void MCStreamer::switchSectionNoPrint(MCSection *Section) { SectionStack.back().second = SectionStack.back().first; SectionStack.back().first = MCSectionSubPair(Section, 0); changeSection(Section, 0); - MCSymbol *Sym = Section->getBeginSymbol(); - if (Sym && !Sym->isInSection()) - emitLabel(Sym); } MCSymbol *MCStreamer::endSection(MCSection *Section) { diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp index bff4b8d..be6d19d 100644 --- a/llvm/lib/MC/MCTargetOptions.cpp +++ b/llvm/lib/MC/MCTargetOptions.cpp @@ -19,7 +19,8 @@ MCTargetOptions::MCTargetOptions() PreserveAsmComments(true), Dwarf64(false), EmitDwarfUnwind(EmitDwarfUnwindType::Default), MCUseDwarfDirectory(DefaultDwarfDirectory), - EmitCompactUnwindNonCanonical(false), PPCUseFullRegisterNames(false) {} + EmitCompactUnwindNonCanonical(false), EmitSFrameUnwind(false), + PPCUseFullRegisterNames(false) {} StringRef MCTargetOptions::getABIName() const { return ABIName; diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 2adc291..ff95ff7 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -41,6 +41,7 @@ MCOPT(int, DwarfVersion) MCOPT(bool, Dwarf64) MCOPT(EmitDwarfUnwindType, EmitDwarfUnwind) MCOPT(bool, EmitCompactUnwindNonCanonical) +MCOPT(bool, EmitSFrameUnwind) MCOPT(bool, ShowMCInst) MCOPT(bool, FatalWarnings) MCOPT(bool, NoWarn) @@ -105,6 +106,11 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { false)); // By default, use DWARF for non-canonical personalities. 
MCBINDOPT(EmitCompactUnwindNonCanonical); + static cl::opt<bool> EmitSFrameUnwind( + "gsframe", cl::desc("Whether to emit .sframe unwind sections."), + cl::init(false)); + MCBINDOPT(EmitSFrameUnwind); + static cl::opt<bool> ShowMCInst( "asm-show-inst", cl::desc("Emit internal instruction representation to assembly file")); @@ -188,6 +194,7 @@ MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() { Options.X86Sse2Avx = getX86Sse2Avx(); Options.EmitDwarfUnwind = getEmitDwarfUnwind(); Options.EmitCompactUnwindNonCanonical = getEmitCompactUnwindNonCanonical(); + Options.EmitSFrameUnwind = getEmitSFrameUnwind(); Options.AsSecureLogFile = getAsSecureLogFile(); return Options; diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 5891420c..e3ef111 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -58,7 +58,7 @@ void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F, void MCWasmStreamer::changeSection(MCSection *Section, uint32_t Subsection) { MCAssembler &Asm = getAssembler(); - auto *SectionWasm = cast<MCSectionWasm>(Section); + auto *SectionWasm = static_cast<const MCSectionWasm *>(Section); const MCSymbol *Grp = SectionWasm->getGroup(); if (Grp) Asm.registerSymbol(*Grp); diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 9369bea..1ffe25c 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -157,7 +157,8 @@ void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { // Ensure that the first and the second symbols relative to the section are // the section symbol and the COMDAT symbol. getAssembler().registerSymbol(*Section->getBeginSymbol()); - if (auto *Sym = cast<MCSectionCOFF>(Section)->getCOMDATSymbol()) + if (auto *Sym = + static_cast<const MCSectionCOFF *>(Section)->getCOMDATSymbol()) getAssembler().registerSymbol(*Sym); } diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 63381b4..898ac5d 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -36,6 +36,20 @@ XCOFFObjectWriter &MCXCOFFStreamer::getWriter() { return static_cast<XCOFFObjectWriter &>(getAssembler().getWriter()); } +void MCXCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { + MCObjectStreamer::changeSection(Section, Subsection); + auto *Sec = static_cast<const MCSectionXCOFF *>(Section); + // We might miss calculating the symbols difference as absolute value before + // adding fixups when symbol_A without the fragment set is the csect itself + // and symbol_B is in it. + // TODO: Currently we only set the fragment for XMC_PR csects and DWARF + // sections because we don't have other cases that hit this problem yet. 
+ // if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR) + // QualName->setFragment(F); + if (Sec->isDwarfSect() || Sec->getMappingClass() == XCOFF::XMC_PR) + Sec->getQualNameSymbol()->setFragment(CurFrag); +} + bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym, MCSymbolAttr Attribute) { auto *Symbol = cast<MCSymbolXCOFF>(Sym); diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 48d2fc6..7b5c3c0 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -126,7 +126,8 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S) const { uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm, const MCSection *Sec) const { uint64_t EndAddr = getSectionAddress(Sec) + Asm.getSectionAddressSize(*Sec); - unsigned Next = cast<MCSectionMachO>(Sec)->getLayoutOrder() + 1; + unsigned Next = + static_cast<const MCSectionMachO *>(Sec)->getLayoutOrder() + 1; if (Next >= SectionOrder.size()) return 0; @@ -259,15 +260,12 @@ void MachObjectWriter::writeSegmentLoadCommand( } void MachObjectWriter::writeSection(const MCAssembler &Asm, - const MCSection &Sec, uint64_t VMAddr, + const MCSectionMachO &Sec, uint64_t VMAddr, uint64_t FileOffset, unsigned Flags, uint64_t RelocationsStart, unsigned NumRelocations) { - uint64_t SectionSize = Asm.getSectionAddressSize(Sec); - const MCSectionMachO &Section = cast<MCSectionMachO>(Sec); - // The offset is unused for virtual sections. - if (Section.isBssSection()) { + if (Sec.isBssSection()) { assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!"); FileOffset = 0; } @@ -275,11 +273,11 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, // struct section (68 bytes) or // struct section_64 (80 bytes) + uint64_t SectionSize = Asm.getSectionAddressSize(Sec); uint64_t Start = W.OS.tell(); (void) Start; - - writeWithPadding(Section.getName(), 16); - writeWithPadding(Section.getSegmentName(), 16); + writeWithPadding(Sec.getName(), 16); + writeWithPadding(Sec.getSegmentName(), 16); if (is64Bit()) { W.write<uint64_t>(VMAddr); // address W.write<uint64_t>(SectionSize); // size @@ -290,14 +288,14 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, assert(isUInt<32>(FileOffset) && "Cannot encode offset of section"); W.write<uint32_t>(FileOffset); - W.write<uint32_t>(Log2(Section.getAlign())); + W.write<uint32_t>(Log2(Sec.getAlign())); assert((!NumRelocations || isUInt<32>(RelocationsStart)) && "Cannot encode offset of relocations"); W.write<uint32_t>(NumRelocations ? RelocationsStart : 0); W.write<uint32_t>(NumRelocations); W.write<uint32_t>(Flags); W.write<uint32_t>(IndirectSymBase.lookup(&Sec)); // reserved1 - W.write<uint32_t>(Section.getStubSize()); // reserved2 + W.write<uint32_t>(Sec.getStubSize()); // reserved2 if (is64Bit()) W.write<uint32_t>(0); // reserved3 @@ -531,7 +529,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Report errors for use of .indirect_symbol not in a symbol pointer section // or stub section. for (IndirectSymbolData &ISD : IndirectSymbols) { - const MCSectionMachO &Section = cast<MCSectionMachO>(*ISD.Section); + const MCSectionMachO &Section = static_cast<MCSectionMachO &>(*ISD.Section); if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && @@ -545,7 +543,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Bind non-lazy symbol pointers first. 
for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) { - const auto &Section = cast<MCSectionMachO>(*ISD.Section); + const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section); if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS) @@ -559,7 +557,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Then lazy symbol pointers and symbol stubs. for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) { - const auto &Section = cast<MCSectionMachO>(*ISD.Section); + const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section); if (Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_SYMBOL_STUBS) @@ -684,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) { for (MCSection &Sec : Asm) { if (!Sec.isBssSection()) { SectionOrder.push_back(&Sec); - cast<MCSectionMachO>(Sec).setLayoutOrder(i++); + static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++); } } for (MCSection &Sec : Asm) { if (Sec.isBssSection()) { SectionOrder.push_back(&Sec); - cast<MCSectionMachO>(Sec).setLayoutOrder(i++); + static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++); } } @@ -907,7 +905,7 @@ uint64_t MachObjectWriter::writeObject() { // ... and then the section headers. uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize; for (const MCSection &Section : Asm) { - const auto &Sec = cast<MCSectionMachO>(Section); + const auto &Sec = static_cast<const MCSectionMachO &>(Section); std::vector<RelAndSymbol> &Relocs = Relocations[&Sec]; unsigned NumRelocs = Relocs.size(); uint64_t SectionStart = SectionDataStart + getSectionAddress(&Sec); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 3b99af4..bfd6334 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -480,7 +480,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F, // The WebAssembly backend should never generate FKF_IsPCRel fixups assert(!Fixup.isPCRel()); - const auto &FixupSection = cast<MCSectionWasm>(*F.getParent()); + const auto &FixupSection = static_cast<MCSectionWasm &>(*F.getParent()); uint64_t C = Target.getConstant(); uint64_t FixupOffset = Asm->getFragmentOffset(F) + Fixup.getOffset(); MCContext &Ctx = getContext(); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 6ad4334..856850d 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -373,7 +373,7 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) { COFFSection *Sec = nullptr; MCSectionCOFF *MCSec = nullptr; if (Base && Base->getFragment()) { - MCSec = cast<MCSectionCOFF>(Base->getFragment()->getParent()); + MCSec = static_cast<MCSectionCOFF *>(Base->getFragment()->getParent()); Sec = SectionMap[MCSec]; } @@ -1057,7 +1057,8 @@ uint64_t WinCOFFWriter::writeObject() { continue; } - const auto *AssocMCSec = cast<MCSectionCOFF>(&AssocMCSym->getSection()); + const auto *AssocMCSec = + static_cast<const MCSectionCOFF *>(&AssocMCSym->getSection()); assert(SectionMap.count(AssocMCSec)); COFFSection *AssocSec = SectionMap[AssocMCSec]; diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 2f6785f..65f543b 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -550,13 +550,13 @@ CsectGroup &XCOFFWriter::getCsectGroup(const MCSectionXCOFF *MCSec) { static MCSectionXCOFF *getContainingCsect(const 
MCSymbolXCOFF *XSym) { if (XSym->isDefined()) - return cast<MCSectionXCOFF>(XSym->getFragment()->getParent()); + return static_cast<MCSectionXCOFF *>(XSym->getFragment()->getParent()); return XSym->getRepresentedCsect(); } void XCOFFWriter::executePostLayoutBinding() { for (const auto &S : *Asm) { - const auto *MCSec = cast<const MCSectionXCOFF>(&S); + auto *MCSec = static_cast<const MCSectionXCOFF *>(&S); assert(!SectionMap.contains(MCSec) && "Cannot add a section twice."); // If the name does not fit in the storage provided in the symbol table @@ -747,7 +747,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, FixedValue = TOCEntryOffset; } } else if (Type == XCOFF::RelocationType::R_RBR) { - MCSectionXCOFF *ParentSec = cast<MCSectionXCOFF>(F.getParent()); + auto *ParentSec = static_cast<MCSectionXCOFF *>(F.getParent()); assert((SymASec->getMappingClass() == XCOFF::XMC_PR && ParentSec->getMappingClass() == XCOFF::XMC_PR) && "Only XMC_PR csect may have the R_RBR relocation."); @@ -768,7 +768,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup, } XCOFFRelocation Reloc = {Index, FixupOffsetInCsect, SignAndSize, Type}; - MCSectionXCOFF *RelocationSec = cast<MCSectionXCOFF>(F.getParent()); + auto *RelocationSec = static_cast<MCSectionXCOFF *>(F.getParent()); assert(SectionMap.contains(RelocationSec) && "Expected containing csect to exist in map."); SectionMap[RelocationSec]->Relocations.push_back(Reloc); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index cff7ab5..f810368 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -364,6 +364,7 @@ #include "llvm/Transforms/Utils/MoveAutoInit.h" #include "llvm/Transforms/Utils/NameAnonGlobals.h" #include "llvm/Transforms/Utils/PredicateInfo.h" +#include "llvm/Transforms/Utils/ProfileVerify.h" #include "llvm/Transforms/Utils/RelLookupTableConverter.h" #include "llvm/Transforms/Utils/StripGCRelocates.h" #include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index caa78b6..fd89583 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -119,7 +119,6 @@ MODULE_PASS("module-inline", ModuleInlinerPass()) MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("nsan", NumericalStabilitySanitizerPass()) -MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) MODULE_PASS("openmp-opt", OpenMPOptPass()) MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)) @@ -520,6 +519,8 @@ FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(errs())) FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(errs())) FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(errs())) FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(errs())) +FUNCTION_PASS("prof-inject", ProfileInjectorPass()) +FUNCTION_PASS("prof-verify", ProfileVerifierPass()) FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib()) diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 5c7b9e0..886add7 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1295,7 +1295,7 @@ Error IndexedInstrProfReader::readHeader() { // Writer 
first writes the length of compressed string, and then the actual // content. const char *VTableNamePtr = (const char *)Ptr; - if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) + if (VTableNamePtr > DataBuffer->getBufferEnd()) return make_error<InstrProfError>(instrprof_error::truncated); VTableName = StringRef(VTableNamePtr, CompressedVTableNamesLen); diff --git a/llvm/lib/Support/AArch64AttributeParser.cpp b/llvm/lib/Support/AArch64AttributeParser.cpp index c675ef2..eed8dba 100644 --- a/llvm/lib/Support/AArch64AttributeParser.cpp +++ b/llvm/lib/Support/AArch64AttributeParser.cpp @@ -8,6 +8,7 @@ //===---------------------------------------------------------------------===// #include "llvm/Support/AArch64AttributeParser.h" +#include "llvm/Support/AArch64BuildAttributes.h" std::vector<llvm::SubsectionAndTagToTagName> & llvm::AArch64AttributeParser::returnTagsNamesMap() { @@ -19,3 +20,29 @@ llvm::AArch64AttributeParser::returnTagsNamesMap() { {"aeabi_feature_and_bits", 2, "Tag_Feature_GCS"}}; return TagsNamesMap; } + +llvm::AArch64BuildAttrSubsections llvm::extractBuildAttributesSubsections( + const llvm::AArch64AttributeParser &Attributes) { + + llvm::AArch64BuildAttrSubsections SubSections; + auto GetPauthValue = [&Attributes](unsigned Tag) { + return Attributes.getAttributeValue("aeabi_pauthabi", Tag).value_or(0); + }; + SubSections.Pauth.TagPlatform = + GetPauthValue(llvm::AArch64BuildAttributes::TAG_PAUTH_PLATFORM); + SubSections.Pauth.TagSchema = + GetPauthValue(llvm::AArch64BuildAttributes::TAG_PAUTH_SCHEMA); + + auto GetFeatureValue = [&Attributes](unsigned Tag) { + return Attributes.getAttributeValue("aeabi_feature_and_bits", Tag) + .value_or(0); + }; + SubSections.AndFeatures |= + GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_BTI); + SubSections.AndFeatures |= + GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_PAC) << 1; + SubSections.AndFeatures |= + GetFeatureValue(llvm::AArch64BuildAttributes::TAG_FEATURE_GCS) << 2; + + return SubSections; +} diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index d5c3cba..8491633 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -68,11 +68,19 @@ template class LLVM_EXPORT_TEMPLATE basic_parser<float>; template class LLVM_EXPORT_TEMPLATE basic_parser<std::string>; template class LLVM_EXPORT_TEMPLATE basic_parser<char>; -template class opt<unsigned>; -template class opt<int>; -template class opt<std::string>; -template class opt<char>; -template class opt<bool>; +#if !(defined(LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS) && defined(_MSC_VER)) +// Only instantiate opt<std::string> when not building a Windows DLL. When +// exporting opt<std::string>, MSVC implicitly exports symbols for +// std::basic_string through transitive inheritance via std::string. These +// symbols may appear in clients, leading to duplicate symbol conflicts. +template class LLVM_EXPORT_TEMPLATE opt<std::string>; +#endif + +template class LLVM_EXPORT_TEMPLATE opt<bool>; +template class LLVM_EXPORT_TEMPLATE opt<char>; +template class LLVM_EXPORT_TEMPLATE opt<int>; +template class LLVM_EXPORT_TEMPLATE opt<unsigned>; + } // namespace cl } // namespace llvm @@ -95,6 +103,15 @@ void parser<float>::anchor() {} void parser<std::string>::anchor() {} void parser<char>::anchor() {} +// These anchor functions instantiate opt<T> and reference its virtual +// destructor to ensure MSVC exports the corresponding vtable and typeinfo when +// building a Windows DLL. 
Without an explicit reference, MSVC may omit the +// instantiation at link time even if it is marked DLL-export. +void opt_bool_anchor() { opt<bool> anchor{""}; } +void opt_char_anchor() { opt<char> anchor{""}; } +void opt_int_anchor() { opt<int> anchor{""}; } +void opt_unsigned_anchor() { opt<unsigned> anchor{""}; } + //===----------------------------------------------------------------------===// const static size_t DefaultPad = 2; diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index c4b43e1..c52487a 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -176,6 +176,9 @@ public: std::optional<AArch64PACKey::ID> PACKey, uint64_t PACDisc, Register PACAddrDisc); + // Emit the sequence for PAC. + void emitPtrauthSign(const MachineInstr *MI); + // Emit the sequence to compute the discriminator. // // The returned register is either unmodified AddrDisc or ScratchReg. @@ -2175,6 +2178,37 @@ void AArch64AsmPrinter::emitPtrauthAuthResign( OutStreamer->emitLabel(EndSym); } +void AArch64AsmPrinter::emitPtrauthSign(const MachineInstr *MI) { + Register Val = MI->getOperand(1).getReg(); + auto Key = (AArch64PACKey::ID)MI->getOperand(2).getImm(); + uint64_t Disc = MI->getOperand(3).getImm(); + Register AddrDisc = MI->getOperand(4).getReg(); + bool AddrDiscKilled = MI->getOperand(4).isKill(); + + // As long as at least one of Val and AddrDisc is in GPR64noip, a scratch + // register is available. + Register ScratchReg = Val == AArch64::X16 ? AArch64::X17 : AArch64::X16; + assert(ScratchReg != AddrDisc && + "Neither X16 nor X17 is available as a scratch register"); + + // Compute pac discriminator + assert(isUInt<16>(Disc)); + Register DiscReg = emitPtrauthDiscriminator( + Disc, AddrDisc, ScratchReg, /*MayUseAddrAsScratch=*/AddrDiscKilled); + bool IsZeroDisc = DiscReg == AArch64::XZR; + unsigned Opc = getPACOpcodeForKey(Key, IsZeroDisc); + + // paciza x16 ; if IsZeroDisc + // pacia x16, x17 ; if !IsZeroDisc + MCInst PACInst; + PACInst.setOpcode(Opc); + PACInst.addOperand(MCOperand::createReg(Val)); + PACInst.addOperand(MCOperand::createReg(Val)); + if (!IsZeroDisc) + PACInst.addOperand(MCOperand::createReg(DiscReg)); + EmitToStreamer(*OutStreamer, PACInst); +} + void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) { bool IsCall = MI->getOpcode() == AArch64::BLRA; unsigned BrTarget = MI->getOperand(0).getReg(); @@ -2890,6 +2924,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { MI->getOperand(4).getImm(), MI->getOperand(5).getReg()); return; + case AArch64::PAC: + emitPtrauthSign(MI); + return; + case AArch64::LOADauthptrstatic: LowerLOADauthptrstatic(*MI); return; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 02ee517..7b49754 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -164,6 +164,9 @@ static cl::opt<bool> UseFEATCPACodegen( /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; +/// Value type used for NZCV flags. +static constexpr MVT FlagsVT = MVT::i32; + static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7}; @@ -3098,6 +3101,83 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, return BB; } +// Helper function to find the instruction that defined a virtual register. 
+// If unable to find such instruction, returns nullptr. +static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI, + Register Reg) { + while (Reg.isVirtual()) { + MachineInstr *DefMI = MRI.getVRegDef(Reg); + assert(DefMI && "Virtual register definition not found"); + unsigned Opcode = DefMI->getOpcode(); + + if (Opcode == AArch64::COPY) { + Reg = DefMI->getOperand(1).getReg(); + // Vreg is defined by copying from physreg. + if (Reg.isPhysical()) + return DefMI; + continue; + } + if (Opcode == AArch64::SUBREG_TO_REG) { + Reg = DefMI->getOperand(2).getReg(); + continue; + } + + return DefMI; + } + return nullptr; +} + +void AArch64TargetLowering::fixupPtrauthDiscriminator( + MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, + MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + + Register AddrDisc = AddrDiscOp.getReg(); + int64_t IntDisc = IntDiscOp.getImm(); + assert(IntDisc == 0 && "Blend components are already expanded"); + + const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc); + if (DiscMI) { + switch (DiscMI->getOpcode()) { + case AArch64::MOVKXi: + // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48". + // #imm should be an immediate and not a global symbol, for example. + if (DiscMI->getOperand(2).isImm() && + DiscMI->getOperand(3).getImm() == 48) { + AddrDisc = DiscMI->getOperand(1).getReg(); + IntDisc = DiscMI->getOperand(2).getImm(); + } + break; + case AArch64::MOVi32imm: + case AArch64::MOVi64imm: + // Small immediate integer constant passed via VReg. + if (DiscMI->getOperand(1).isImm() && + isUInt<16>(DiscMI->getOperand(1).getImm())) { + AddrDisc = AArch64::NoRegister; + IntDisc = DiscMI->getOperand(1).getImm(); + } + break; + } + } + + // For uniformity, always use NoRegister, as XZR is not necessarily contained + // in the requested register class. + if (AddrDisc == AArch64::XZR) + AddrDisc = AArch64::NoRegister; + + // Make sure AddrDisc operand respects the register class imposed by MI. + if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) { + Register TmpReg = MRI.createVirtualRegister(AddrDiscRC); + BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc); + AddrDisc = TmpReg; + } + + AddrDiscOp.setReg(AddrDisc); + IntDiscOp.setImm(IntDisc); +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3196,6 +3276,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true); case AArch64::MOVT_TIZ_PSEUDO: return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true); + + case AArch64::PAC: + fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4), + &AArch64::GPR64noipRegClass); + return BB; } } @@ -3451,7 +3536,7 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, } unsigned Opcode = IsSignaling ? 
AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; - return DAG.getNode(Opcode, DL, {MVT::i32, MVT::Other}, {Chain, LHS, RHS}); + return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS}); } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, @@ -3465,7 +3550,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } - return DAG.getNode(AArch64ISD::FCMP, DL, MVT::i32, LHS, RHS); + return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS); } // The CMP instruction is just an alias for SUBS, and representing it as @@ -3490,7 +3575,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // (a.k.a. ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. const SDValue ANDSNode = - DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, MVT_CC), + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT), LHS.getOperand(0), LHS.getOperand(1)); // Replace all users of (and X, Y) with newly generated (ands X, Y) DAG.ReplaceAllUsesWith(LHS, ANDSNode); @@ -3501,7 +3586,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } - return DAG.getNode(Opcode, DL, DAG.getVTList(VT, MVT_CC), LHS, RHS) + return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS) .getValue(1); } @@ -3597,7 +3682,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); - return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); + return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp); } /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be @@ -4036,7 +4121,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); // Check that the result fits into a 32-bit integer. - SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); + SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT); if (IsSigned) { // cmp xreg, wreg, sxtw SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); @@ -4059,12 +4144,12 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), @@ -4075,7 +4160,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { } // switch (...) if (Opc) { - SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); + SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT); // Emit the AArch64 operation with overflow check. 
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); @@ -4177,7 +4262,7 @@ static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); SDValue Cmp = - DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1); return Cmp.getValue(1); } @@ -4220,16 +4305,15 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(VT0, VT1); - SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, + SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS, OpRHS, OpCarryIn); SDValue OutFlag = IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG) : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); - return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); + return DAG.getMergeValues({Sum, OutFlag}, DL); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { @@ -4254,8 +4338,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Overflow = DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Value, Overflow); + return DAG.getMergeValues({Value, Overflow}, DL); } // Prefetch operands are: @@ -6813,7 +6896,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), - {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo), + DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()}, StoreNode->getMemoryVT(), StoreNode->getMemOperand()); return Result; } @@ -7037,9 +7121,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op.getOperand(0)); // Generate SUBS & CSEL. - SDValue Cmp = - DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - Op.getOperand(0), DAG.getConstant(0, DL, VT)); + SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), + Op.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, DL, MVT::i32), Cmp.getValue(1)); @@ -11108,7 +11191,7 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, SDValue Carry = Op.getOperand(2); // SBCS uses a carry not a borrow so the carry flag should be inverted first. SDValue InvCarry = valueToCarryFlag(Carry, DAG, true); - SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue), + SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS, InvCarry); EVT OpVT = Op.getValueType(); @@ -12441,10 +12524,10 @@ SDValue AArch64TargetLowering::LowerAsmOutputForConstraint( // Get NZCV register. Only update chain when copyfrom is glued. 
if (Glue.getNode()) { - Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue); + Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue); Chain = Glue.getValue(1); } else - Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32); + Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT); // Extract CC code. SDValue CC = getSETCC(Cond, Glue, DL, DAG); @@ -18020,11 +18103,14 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( unsigned ShlAmt = C2->getZExtValue(); if (auto ShouldADD = *N->user_begin(); ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) { - if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) { - unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8; - if ((1ULL << ShlAmt) == ByteVT && - isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT())) - return false; + if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) { + EVT MemVT = Load->getMemoryVT(); + + if (Load->getValueType(0).isScalableVector()) + return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits(); + + if (isIndexedLoadLegal(ISD::PRE_INC, MemVT)) + return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits(); } } } @@ -18593,7 +18679,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, Created.push_back(And.getNode()); } else { SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); - SDVTList VTs = DAG.getVTList(VT, MVT::i32); + SDVTList VTs = DAG.getVTList(VT, FlagsVT); SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); @@ -19482,10 +19568,10 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { // can select to CCMN to avoid the extra mov SDValue AbsOp1 = DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0)); - CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1, - NZCVOp, Condition, Cmp0); + CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0), + AbsOp1, NZCVOp, Condition, Cmp0); } else { - CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0), + CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0), Cmp1.getOperand(1), NZCVOp, Condition, Cmp0); } return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0), @@ -25134,8 +25220,9 @@ static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) { if (!TReassocOp && !FReassocOp) return SDValue(); - SDValue NewCmp = DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode), - DAG.getVTList(VT, MVT_CC), CmpOpOther, SubsOp); + SDValue NewCmp = + DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode), + DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp); auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) { if (!ReassocOp) @@ -27161,7 +27248,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, : AArch64SysReg::RNDRRS); SDLoc DL(N); SDValue A = DAG.getNode( - AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::i32, MVT::Other), + AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other), N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32)); SDValue B = DAG.getNode( AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), @@ -27907,16 +27994,16 @@ void AArch64TargetLowering::ReplaceNodeResults( MemVT.getScalarSizeInBits() == 32u || MemVT.getScalarSizeInBits() == 64u)) { + EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext()); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::LDNP, SDLoc(N), - 
DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - MVT::Other}), + DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}), {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), LoadNode->getMemOperand()); SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT, - Result.getValue(0), Result.getValue(1)); + DAG.getBitcast(HalfVT, Result.getValue(0)), + DAG.getBitcast(HalfVT, Result.getValue(1))); Results.append({Pair, Result.getValue(2) /* Chain */}); return; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d8403c2..95d0e3b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -182,6 +182,13 @@ public: MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const; + /// Replace (0, vreg) discriminator components with the operands of blend + /// or with (immediate, NoRegister) when possible. + void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, + MachineOperand &IntDiscOp, + MachineOperand &AddrDiscOp, + const TargetRegisterClass *AddrDiscRC) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index bc57537..8685d7a0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,7 +20,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -36,7 +35,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -533,8 +531,9 @@ bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, MBP.LHS = LastInst->getOperand(0); MBP.RHS = MachineOperand::CreateImm(0); - MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE - : MachineBranchPredicate::PRED_EQ; + MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW) + ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; return false; } @@ -7353,9 +7352,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: - case AArch64MachineCombinerPattern::GATHER_LANE_i32: - case AArch64MachineCombinerPattern::GATHER_LANE_i16: - case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7396,252 +7392,11 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } -static bool getGatherPattern(MachineInstr &Root, - SmallVectorImpl<unsigned> &Patterns, - unsigned LoadLaneOpCode, unsigned NumLanes) { - const MachineFunction *MF = Root.getMF(); - - // Early exit if optimizing for size. 
- if (MF->getFunction().hasMinSize()) - return false; - - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - - // The root of the pattern must load into the last lane of the vector. - if (Root.getOperand(2).getImm() != NumLanes - 1) - return false; - - // Check that we have load into all lanes except lane 0. - // For each load we also want to check that: - // 1. It has a single non-debug use (since we will be replacing the virtual - // register) - // 2. That the addressing mode only uses a single offset register. - auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - auto Range = llvm::seq<unsigned>(1, NumLanes - 1); - SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end()); - while (!RemainingLanes.empty() && CurrInstr && - CurrInstr->getOpcode() == LoadLaneOpCode && - MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && - CurrInstr->getNumOperands() == 4) { - RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); - CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); - } - - if (!RemainingLanes.empty()) - return false; - - // Match the SUBREG_TO_REG sequence. - if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) - return false; - - // Verify that the subreg to reg loads an integer into the first lane. - auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); - unsigned SingleLaneSizeInBits = 128 / NumLanes; - if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) - return false; - - // Verify that it also has a single non debug use. - if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) - return false; - - switch (NumLanes) { - case 4: - Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); - break; - case 8: - Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); - break; - case 16: - Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); - break; - default: - llvm_unreachable("Got bad number of lanes for gather pattern."); - } - - return true; -} - -/// Search for patterns where we use LD1 instructions to load into -/// separate lanes of an 128 bit Neon register. We can increase Memory Level -/// Parallelism by loading into 2 Neon registers instead. -static bool getLoadPatterns(MachineInstr &Root, - SmallVectorImpl<unsigned> &Patterns) { - - // The pattern searches for loads into single lanes. - switch (Root.getOpcode()) { - case AArch64::LD1i32: - return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); - case AArch64::LD1i16: - return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); - case AArch64::LD1i8: - return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); - default: - return false; - } -} - -static void -generateGatherPattern(MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<Register, unsigned> &InstrIdxForVirtReg, - unsigned Pattern, unsigned NumLanes) { - - MachineFunction &MF = *Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - - // Gather the initial load instructions to build the pattern - SmallVector<MachineInstr *, 16> LoadToLaneInstrs; - MachineInstr *CurrInstr = &Root; - for (unsigned i = 0; i < NumLanes - 1; ++i) { - LoadToLaneInstrs.push_back(CurrInstr); - CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); - } - - // Sort the load instructions according to the lane. 
- llvm::sort(LoadToLaneInstrs, - [](const MachineInstr *A, const MachineInstr *B) { - return A->getOperand(2).getImm() > B->getOperand(2).getImm(); - }); - - MachineInstr *SubregToReg = CurrInstr; - LoadToLaneInstrs.push_back( - MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); - auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); - - const TargetRegisterClass *FPR128RegClass = - MRI.getRegClass(Root.getOperand(0).getReg()); - - auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, - Register SrcRegister, unsigned Lane, - Register OffsetRegister) { - auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); - MachineInstrBuilder LoadIndexIntoRegister = - BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), - NewRegister) - .addReg(SrcRegister) - .addImm(Lane) - .addReg(OffsetRegister, getKillRegState(true)); - InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); - InsInstrs.push_back(LoadIndexIntoRegister); - return NewRegister; - }; - - // Helper to create load instruction based on opcode - auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, - Register OffsetReg) -> MachineInstrBuilder { - unsigned Opcode; - switch (NumLanes) { - case 4: - Opcode = AArch64::LDRSui; - break; - case 8: - Opcode = AArch64::LDRHui; - break; - case 16: - Opcode = AArch64::LDRBui; - break; - default: - llvm_unreachable( - "Got unsupported number of lanes in machine-combiner gather pattern"); - } - // Immediate offset load - return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) - .addReg(OffsetReg) - .addImm(0); // immediate offset - }; - - // Load the remaining lanes into register 0. - auto LanesToLoadToReg0 = - llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, - LoadToLaneInstrsAscending.begin() + NumLanes / 2); - auto PrevReg = SubregToReg->getOperand(0).getReg(); - for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { - PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, - LoadInstr->getOperand(3).getReg()); - DelInstrs.push_back(LoadInstr); - } - auto LastLoadReg0 = PrevReg; - - // First load into register 1. Perform a LDRSui to zero out the upper lanes in - // a single instruction. - auto Lane0Load = *LoadToLaneInstrsAscending.begin(); - auto OriginalSplitLoad = - *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); - auto DestRegForMiddleIndex = MRI.createVirtualRegister( - MRI.getRegClass(Lane0Load->getOperand(0).getReg())); - - MachineInstrBuilder MiddleIndexLoadInstr = - CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, - OriginalSplitLoad->getOperand(3).getReg()); - - InstrIdxForVirtReg.insert( - std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); - InsInstrs.push_back(MiddleIndexLoadInstr); - DelInstrs.push_back(OriginalSplitLoad); - - // Subreg To Reg instruction for register 1. 
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); - unsigned SubregType; - switch (NumLanes) { - case 4: - SubregType = AArch64::ssub; - break; - case 8: - SubregType = AArch64::hsub; - break; - case 16: - SubregType = AArch64::bsub; - break; - default: - llvm_unreachable( - "Got invalid NumLanes for machine-combiner gather pattern"); - } - - auto SubRegToRegInstr = - BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), - DestRegForSubregToReg) - .addImm(0) - .addReg(DestRegForMiddleIndex, getKillRegState(true)) - .addImm(SubregType); - InstrIdxForVirtReg.insert( - std::make_pair(DestRegForSubregToReg, InsInstrs.size())); - InsInstrs.push_back(SubRegToRegInstr); - - // Load remaining lanes into register 1. - auto LanesToLoadToReg1 = - llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, - LoadToLaneInstrsAscending.end()); - PrevReg = SubRegToRegInstr->getOperand(0).getReg(); - for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { - PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, - LoadInstr->getOperand(3).getReg()); - if (Index == NumLanes / 2 - 2) { - break; - } - DelInstrs.push_back(LoadInstr); - } - auto LastLoadReg1 = PrevReg; - - // Create the final zip instruction to combine the results. - MachineInstrBuilder ZipInstr = - BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), - Root.getOperand(0).getReg()) - .addReg(LastLoadReg0) - .addReg(LastLoadReg1); - InsInstrs.push_back(ZipInstr); -} - CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: - case AArch64MachineCombinerPattern::GATHER_LANE_i32: - case AArch64MachineCombinerPattern::GATHER_LANE_i16: - case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7671,10 +7426,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; - // Load patterns - if (getLoadPatterns(Root, Patterns)) - return true; - return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8930,21 +8681,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } - case AArch64MachineCombinerPattern::GATHER_LANE_i32: { - generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, - Pattern, 4); - break; - } - case AArch64MachineCombinerPattern::GATHER_LANE_i16: { - generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, - Pattern, 8); - break; - } - case AArch64MachineCombinerPattern::GATHER_LANE_i8: { - generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, - Pattern, 16); - break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 02734866..7c255da 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, - - GATHER_LANE_i32, - GATHER_LANE_i16, - GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9f8a257..07cacfa 100644 
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -430,26 +430,27 @@ def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, SDTCisVT<1, i32>]>; + SDTCisInt<0>, + SDTCisVT<1, FlagsVT>]>; // SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, - SDTCisVT<3, i32>]>; + SDTCisVT<3, FlagsVT>]>; // SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, - SDTCisVT<1, i32>, - SDTCisVT<4, i32>]>; + SDTCisVT<1, FlagsVT>, + SDTCisVT<4, FlagsVT>]>; def SDT_AArch64Brcond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>]>; + SDTCisVT<2, FlagsVT>]>; def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; @@ -458,22 +459,22 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, - SDTCisVT<4, i32>]>; + SDTCisVT<4, FlagsVT>]>; def SDT_AArch64CCMP : SDTypeProfile<1, 5, - [SDTCisVT<0, i32>, + [SDTCisVT<0, FlagsVT>, SDTCisInt<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, SDTCisInt<4>, SDTCisVT<5, i32>]>; def SDT_AArch64FCCMP : SDTypeProfile<1, 5, - [SDTCisVT<0, i32>, + [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisInt<3>, SDTCisInt<4>, SDTCisVT<5, i32>]>; -def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, +def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, FlagsVT>, SDTCisFP<1>, SDTCisSameAs<2, 1>]>; def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -518,10 +519,10 @@ def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; -def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; -def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; // Generates the general dynamic sequences, i.e. // adrp x0, :tlsdesc:var @@ -1124,10 +1125,10 @@ def AArch64probedalloca SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPMayStore]>; -// MRS, also sets the flags via a glue. +// MRS, also sets the flags. 
def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<2, 1, [SDTCisVT<0, i64>, - SDTCisVT<1, i32>, + SDTCisVT<1, FlagsVT>, SDTCisVT<2, i32>]>, [SDNPHasChain]>; @@ -2032,7 +2033,7 @@ let Predicates = [HasPAuth] in { def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb"), op>; } - defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>; + defm PAC : SignAuth<0b000, 0b010, "pac", null_frag>; defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>; def XPACI : ClearAuth<0, "xpaci">; @@ -2152,6 +2153,26 @@ let Predicates = [HasPAuth] in { let Uses = []; } + // PAC pseudo instruction. In AsmPrinter, it is expanded into an actual PAC* + // instruction immediately preceded by the discriminator computation. + // This enforces the expected immediate modifier is used for signing, even + // if an attacker is able to substitute AddrDisc. + def PAC : Pseudo<(outs GPR64:$SignedVal), + (ins GPR64:$Val, i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc), + [], "$SignedVal = $Val">, Sched<[WriteI, ReadI]> { + let isCodeGenOnly = 1; + let hasSideEffects = 0; + let mayStore = 0; + let mayLoad = 0; + let Size = 12; + let Defs = [X16, X17]; + let usesCustomInserter = 1; + } + + // A standalone pattern is used, so that literal 0 can be passed as $Disc. + def : Pat<(int_ptrauth_sign GPR64:$Val, timm:$Key, GPR64noip:$AddrDisc), + (PAC GPR64:$Val, $Key, 0, GPR64noip:$AddrDisc)>; + // AUT and re-PAC a value, using different keys/data. // This directly manipulates x16/x17, which are the only registers that // certain OSs guarantee are safe to use for sensitive operations. @@ -3934,6 +3955,26 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; +// load zero-extended i32, bitcast to f64 +def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; + +// load zero-extended i16, bitcast to f64 +def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; + +// load zero-extended i8, bitcast to f64 +def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; + +// load zero-extended i16, bitcast to f32 +def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; + +// load zero-extended i8, bitcast to f32 +def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))), + (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; + // Pre-fetch. def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", [(AArch64Prefetch timm:$Rt, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 61bf87f..1a7609b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -305,7 +305,8 @@ def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">; def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">; // Condition code regclass. 
-def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { +defvar FlagsVT = i32; +def CCR : RegisterClass<"AArch64", [FlagsVT], 32, (add NZCV)> { let CopyCost = -1; // Don't allow copying of status registers. // CCR is not allocatable. diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index bafb8d0..8a5b5ba 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -32,10 +32,29 @@ AArch64SelectionDAGInfo::AArch64SelectionDAGInfo() void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, const SDNode *N) const { + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); + #ifndef NDEBUG + // Some additional checks not yet implemented by verifyTargetNode. + constexpr MVT FlagsVT = MVT::i32; switch (N->getOpcode()) { - default: - return SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); + case AArch64ISD::SUBS: + assert(N->getValueType(1) == FlagsVT); + break; + case AArch64ISD::ADC: + case AArch64ISD::SBC: + assert(N->getOperand(2).getValueType() == FlagsVT); + break; + case AArch64ISD::ADCS: + case AArch64ISD::SBCS: + assert(N->getValueType(1) == FlagsVT); + assert(N->getOperand(2).getValueType() == FlagsVT); + break; + case AArch64ISD::CSEL: + case AArch64ISD::CSINC: + case AArch64ISD::BRCOND: + assert(N->getOperand(3).getValueType() == FlagsVT); + break; case AArch64ISD::SADDWT: case AArch64ISD::SADDWB: case AArch64ISD::UADDWT: diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index c218831..85de2d5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -36,7 +36,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, // SHF_AARCH64_PURECODE flag set if the "+execute-only" target feature is // present. if (TM.getMCSubtargetInfo()->hasFeature(AArch64::FeatureExecuteOnly)) { - auto *Text = cast<MCSectionELF>(TextSection); + auto *Text = static_cast<MCSectionELF *>(TextSection); Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE); } } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 08f547a..6257e99 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -523,7 +523,8 @@ void AArch64TargetELFStreamer::finish() { // mark it execute-only if it is empty and there is at least one // execute-only section in the object. 
if (any_of(Asm, [](const MCSection &Sec) { - return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_AARCH64_PURECODE; + return static_cast<const MCSectionELF &>(Sec).getFlags() & + ELF::SHF_AARCH64_PURECODE; })) { auto *Text = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 3d4a14b..1a9bce5 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -9,8 +9,6 @@ #include "AArch64MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 1ac340a..a22a17a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -132,7 +132,8 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section, // But only if they don't point to a few forbidden sections. if (!Symbol.isInSection()) return true; - const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection()); + const MCSectionMachO &RefSec = + static_cast<MCSectionMachO &>(Symbol.getSection()); if (RefSec.getType() == MachO::S_CSTRING_LITERALS) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index e4e7bdc..8b8fc8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; +def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts", + "HasFmaMixBF16Insts", + "true", + "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions" +>; + def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts", "HasIEEEMinimumMaximumInsts", "true", @@ -167,6 +173,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", "Has v_minimum3_f16 and v_maximum3_f16 instructions" >; +def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16", + "HasMin3Max3PKF16", + "true", + "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions" +>; + def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16", "HasMinimum3Maximum3PKF16", "true", @@ -256,12 +268,24 @@ def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", "S_INST_PREFETCH instruction causes shader to hang" >; +def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts", + "HasVmemPrefInsts", + "true", + "Has flat_prefect_b8 and global_prefetch_b8 instructions" +>; + def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch", "HasSafeSmemPrefetch", "true", "SMEM prefetches do not fail on illegal address" >; +def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", + "HasSafeCUPrefetch", + "true", + "VMEM CU scope prefetches do not fail on illegal address" +>; + def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -559,6 +583,12 @@ def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", "Has bf16 conversion instructions" >; +def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts", + "HasBF16PackedInsts", + "true", + 
"Has bf16 packed instructions (fma, add, mul, max, min)" +>; + def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", @@ -1349,6 +1379,10 @@ def FeatureLshlAddU64Inst : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", "Has v_lshl_add_u64 instruction">; +def FeatureAddSubU64Insts + : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", + "Has v_add_u64 and v_sub_u64 instructions">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -1989,7 +2023,10 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, + FeatureBF16PackedInsts, FeatureCvtPkF16F32Inst, + FeatureFmaMixBF16Insts, + FeatureMin3Max3PKF16, FeatureMinimum3Maximum3PKF16, FeaturePrngInst, FeaturePermlane16Swap, @@ -2002,7 +2039,9 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureFlatBufferGlobalAtomicFaddF64Inst, FeatureMemoryAtomicFAddF32DenormalSupport, FeatureKernargPreload, + FeatureVmemPrefInsts, FeatureLshlAddU64Inst, + FeatureAddSubU64Insts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2349,6 +2388,10 @@ def HasMinimum3Maximum3F16 : Predicate<"Subtarget->hasMinimum3Maximum3F16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; +def HasMin3Max3PKF16 : + Predicate<"Subtarget->hasMin3Max3PKF16()">, + AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>; + def HasMinimum3Maximum3PKF16 : Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">, AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>; @@ -2472,6 +2515,9 @@ def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; +def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">, + AssemblerPredicate<(all_of FeatureBF16PackedInsts)>; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; @@ -2573,6 +2619,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">, def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, AssemblerPredicate<(all_of FeatureFmaMixInsts)>; +def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, + AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; + def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<(all_of FeatureDLInsts)>; @@ -2771,12 +2820,18 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, AssemblerPredicate<(all_of FeatureXF32Insts)>; +def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, + AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; + def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, AssemblerPredicate<(all_of FeatureAshrPkInsts)>; def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; +def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, + AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index dedee46..49d8b44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1383,7 +1383,7 @@ static bool 
runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, - &AAIndirectCallInfo::ID, &AAInstanceInfo::ID}); + &AAIndirectCallInfo::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 891d362..c01e5d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,6 +137,9 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher<s64, "selectGlobalSAddr">, GIComplexPatternEquiv<GlobalSAddr>; +def gi_global_saddr_cpol : + GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">, + GIComplexPatternEquiv<GlobalSAddrCPol>; def gi_global_saddr_glc : GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">, GIComplexPatternEquiv<GlobalSAddrGLC>; @@ -446,5 +449,8 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">, GISDNodeXFormEquiv<as_hw_round_mode>; +def gi_prefetch_loc : GICustomOperandRenderer<"renderPrefetchLoc">, + GISDNodeXFormEquiv<PrefetchLoc>; + def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">, GISDNodeXFormEquiv<MFMALdScaleXForm>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 00979f4..f36935d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) { return LLT::scalar(32); } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI); - -static void unmergeReadAnyLane(MachineIRBuilder &B, - SmallVectorImpl<Register> &SgprDstParts, - LLT UnmergeTy, Register VgprSrc, - const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static Register buildReadLane(MachineIRBuilder &, Register, + const RegisterBankInfo &, ReadLaneFnTy); + +template <typename ReadLaneFnTy> +static void +unmergeReadAnyLane(MachineIRBuilder &B, SmallVectorImpl<Register> &SgprDstParts, + LLT UnmergeTy, Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID); auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc); for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { - SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI)); + SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL)); } } -static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, - const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI, + ReadLaneFnTy BuildRL) { LLT Ty = B.getMRI()->getType(VgprSrc); const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID); if (Ty.getSizeInBits() == 32) { - return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc}) - .getReg(0); + Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty}); + return BuildRL(B, SgprDst, VgprSrc).getReg(0); } SmallVector<Register, 8> SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + 
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildRL); return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0); } -void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, - Register VgprSrc, const RegisterBankInfo &RBI) { +template <typename ReadLaneFnTy> +static void buildReadLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI, + ReadLaneFnTy BuildReadLane) { LLT Ty = B.getMRI()->getType(VgprSrc); if (Ty.getSizeInBits() == 32) { - B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + BuildReadLane(B, SgprDst, VgprSrc); return; } SmallVector<Register, 8> SgprDstParts; - unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI, + BuildReadLane); B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0); } + +void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return buildReadLane( + B, SgprDst, VgprSrc, RBI, + [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) { + return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + }); +} + +void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + return buildReadLane( + B, SgprDst, VgprSrc, RBI, + [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) { + return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst) + .addReg(VgprSrc); + }); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 0c89bb5..5e1000e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -51,6 +51,8 @@ private: void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI); +void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, + const RegisterBankInfo &RBI); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5a2416de..dfaa145 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2020,6 +2020,22 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, @@ -3861,58 +3877,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +// Match lowered fpext from bf16 to f32. This is a bit operation extending +// a 16-bit value with 16-bit of zeroes at LSB: +// +// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val))))) +// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true +// 3. 
(f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false +static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) { + if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + + IsExtractHigh = false; + if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) { + auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + if (!Low16 || !Low16->isZero()) + return SDValue(); + Op = stripBitcast(Op.getOperand(1)); + if (Op.getValueType() != MVT::bf16) + return SDValue(); + return Op; + } + + if (Op.getValueType() != MVT::i32) + return SDValue(); + + if (Op.getOpcode() == ISD::AND) { + if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Mask->getZExtValue() == 0xffff0000) { + IsExtractHigh = true; + return Op.getOperand(0); + } + } + return SDValue(); + } + + if (Op.getOpcode() == ISD::SHL) { + if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Amt->getZExtValue() == 16) + return Op.getOperand(0); + } + } + + return SDValue(); +} + // The return value is not whether the match is possible (which it always is), // but whether or not it a conversion is really used. bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + MVT VT) const { Mods = 0; SelectVOP3ModsImpl(In, Src, Mods); + bool IsExtractHigh = false; if (Src.getOpcode() == ISD::FP_EXTEND) { Src = Src.getOperand(0); - assert(Src.getValueType() == MVT::f16); - Src = stripBitcast(Src); + } else if (VT == MVT::bf16) { + SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh); + if (!B16) + return false; + Src = B16; + } else + return false; - // Be careful about folding modifiers if we already have an abs. fneg is - // applied last, so we don't want to apply an earlier fneg. - if ((Mods & SISrcMods::ABS) == 0) { - unsigned ModsTmp; - SelectVOP3ModsImpl(Src, Src, ModsTmp); + if (Src.getValueType() != VT && + (VT != MVT::bf16 || Src.getValueType() != MVT::i32)) + return false; - if ((ModsTmp & SISrcMods::NEG) != 0) - Mods ^= SISrcMods::NEG; + Src = stripBitcast(Src); - if ((ModsTmp & SISrcMods::ABS) != 0) - Mods |= SISrcMods::ABS; - } + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); - // op_sel/op_sel_hi decide the source type and source. - // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. - // If the sources's op_sel is set, it picks the high half of the source - // register. + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; - Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(Src, Src)) { - Mods |= SISrcMods::OP_SEL_0; + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } - // TODO: Should we try to look for neg/abs here? - } + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the sources's op_sel is set, it picks the high half of the source + // register. 
- // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } - return true; + Mods |= SISrcMods::OP_SEL_1; + if (IsExtractHigh || + (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { + Mods |= SISrcMods::OP_SEL_0; + + // TODO: Should we try to look for neg/abs here? } - return false; + // Prevent unnecessary subreg COPY to VGPR_16 + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + } + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16)) return false; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; @@ -3921,7 +3993,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - SelectVOP3PMadMixModsImpl(In, Src, Mods); + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 6123d75..5636d89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -19,6 +19,7 @@ #include "SIModeRegisterDefaults.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -167,6 +168,9 @@ private: bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; + bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const; @@ -254,11 +258,15 @@ private: bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + MVT VT) const; bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 877c3ac..266dee1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5774,6 +5774,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const { + const MachineInstr &I = *Root.getParent(); + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL; + return selectGlobalSAddr(Root, PassedCPol); +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); } @@ -7068,6 +7078,17 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB, MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4); } +void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + uint32_t V = MI.getOperand(2).getImm(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) + << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + MIB.addImm(V); +} + /// Convert from 2-bit value to enum values used for op_sel* source modifiers. void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand( MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 5f7f05c..fe9743d0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -261,6 +261,8 @@ private: InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectGlobalSAddrCPol(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -414,6 +416,10 @@ private: void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + + void renderPrefetchLoc(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e7bf88d..fedfa3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, assert(Ty.isScalar()); unsigned Size = Ty.getSizeInBits(); + if (ST.hasVectorMulU64() && Size == 64) + return true; + unsigned NumParts = Size / 32; assert((Size % 32) == 0); assert(NumParts >= 2); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index ba66134..e187959 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,6 +23,8 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include 
"llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -115,126 +117,233 @@ public: VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; - bool isLaneMask(Register Reg) { - const RegisterBank *RB = MRI.getRegBankOrNull(Reg); - if (RB && RB->getID() == AMDGPU::VCCRegBankID) - return true; + bool isLaneMask(Register Reg); + std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); + std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); + Register getReadAnyLaneSrc(Register Src); + void replaceRegWithOrBuildCopy(Register Dst, Register Src); - const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); - return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); - } + bool tryEliminateReadAnyLane(MachineInstr &Copy); + void tryCombineCopy(MachineInstr &MI); + void tryCombineS1AnyExt(MachineInstr &MI); +}; - void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { - MI.eraseFromParent(); - if (Optional0 && isTriviallyDead(*Optional0, MRI)) - Optional0->eraseFromParent(); - } +bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::VCCRegBankID) + return true; - std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) { - MachineInstr *MatchMI = MRI.getVRegDef(Src); - if (MatchMI->getOpcode() != Opcode) - return {nullptr, Register()}; - return {MatchMI, MatchMI->getOperand(1).getReg()}; - } + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); +} - void tryCombineCopy(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - // Skip copies of physical registers. - if (!Dst.isVirtual() || !Src.isVirtual()) - return; - - // This is a cross bank copy, sgpr S1 to lane mask. - // - // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) - // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) - // -> - // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) - if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { - auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); - assert(Trunc && MRI.getType(TruncS32Src) == S32 && - "sgpr S1 must be result of G_TRUNC of sgpr S32"); - - B.setInstr(MI); - // Ensure that truncated bits in BoolSrc are 0. 
- auto One = B.buildConstant({SgprRB, S32}, 1); - auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); - B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); - cleanUpAfterCombine(MI, Trunc); - return; - } +std::pair<MachineInstr *, Register> +AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { + MachineInstr *MatchMI = MRI.getVRegDef(Src); + if (MatchMI->getOpcode() != Opcode) + return {nullptr, Register()}; + return {MatchMI, MatchMI->getOperand(1).getReg()}; +} + +std::pair<GUnmerge *, int> +AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { + MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); + if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) + return {nullptr, -1}; + + Register RALSrc = ReadAnyLane->getOperand(1).getReg(); + if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) + return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; - // Src = G_AMDGPU_READANYLANE RALSrc - // Dst = COPY Src - // -> - // Dst = RALSrc - if (MRI.getRegBankOrNull(Dst) == VgprRB && - MRI.getRegBankOrNull(Src) == SgprRB) { - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (!RAL) - return; - - assert(MRI.getRegBank(RALSrc) == VgprRB); - MRI.replaceRegWith(Dst, RALSrc); - cleanUpAfterCombine(MI, RAL); - return; + return {nullptr, -1}; +} + +Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { + // Src = G_AMDGPU_READANYLANE RALSrc + auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); + if (RAL) + return RALSrc; + + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc + // LoSgpr = G_AMDGPU_READANYLANE LoVgpr + // HiSgpr = G_AMDGPU_READANYLANE HiVgpr + // Src G_MERGE_VALUES LoSgpr, HiSgpr + auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI); + if (Merge) { + unsigned NumElts = Merge->getNumSources(); + auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); + if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) + return {}; + + // Check if all elements are from same unmerge and there is no shuffling. + for (unsigned i = 1; i < NumElts; ++i) { + auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); + if (UnmergeI != Unmerge || (unsigned)IdxI != i) + return {}; } + return Unmerge->getSourceReg(); } - void tryCombineS1AnyExt(MachineInstr &MI) { - // %Src:sgpr(S1) = G_TRUNC %TruncSrc - // %Dst = G_ANYEXT %Src:sgpr(S1) - // -> - // %Dst = G_... %TruncSrc - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - if (MRI.getType(Src) != S1) - return; - - auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); - if (!Trunc) - return; - - LLT DstTy = MRI.getType(Dst); - LLT TruncSrcTy = MRI.getType(TruncSrc); - - if (DstTy == TruncSrcTy) { - MRI.replaceRegWith(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc + // SourceReg G_MERGE_VALUES ..., SrcRegIdx, ... + // ..., Src, ... 
= G_UNMERGE_VALUES SourceReg + auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI); + if (!UnMerge) + return {}; + + int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); + Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); + if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) + return {}; + + Register SrcRegIdx = Merge->getSourceReg(Idx); + if (MRI.getType(Src) != MRI.getType(SrcRegIdx)) + return {}; + + auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); + if (RALEl) + return RALElSrc; + + return {}; +} + +void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst, + Register Src) { + if (Dst.isVirtual()) + MRI.replaceRegWith(Dst, Src); + else + B.buildCopy(Dst, Src); +} + +bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( + MachineInstr &Copy) { + Register Dst = Copy.getOperand(0).getReg(); + Register Src = Copy.getOperand(1).getReg(); + + // Skip non-vgpr Dst + if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB) + : !TRI.isVGPR(MRI, Dst)) + return false; + + // Skip physical source registers and source registers with register class + if (!Src.isVirtual() || MRI.getRegClassOrNull(Src)) + return false; + + Register RALDst = Src; + MachineInstr &SrcMI = *MRI.getVRegDef(Src); + if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) + RALDst = SrcMI.getOperand(1).getReg(); + + Register RALSrc = getReadAnyLaneSrc(RALDst); + if (!RALSrc) + return false; + + B.setInstr(Copy); + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { + // Src = READANYLANE RALSrc Src = READANYLANE RALSrc + // Dst = Copy Src $Dst = Copy Src + // -> -> + // Dst = RALSrc $Dst = Copy RALSrc + replaceRegWithOrBuildCopy(Dst, RALSrc); + } else { + // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc + // Src = G_BITCAST RALDst Src = G_BITCAST RALDst + // Dst = Copy Src Dst = Copy Src + // -> -> + // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst + // Dst = NewVgpr $Dst = Copy NewVgpr + auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); + replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); + } + + eraseInstr(Copy, MRI); + return true; +} + +void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) { + if (tryEliminateReadAnyLane(MI)) + return; + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + // Skip copies of physical registers. + if (!Dst.isVirtual() || !Src.isVirtual()) + return; + + // This is a cross bank copy, sgpr S1 to lane mask. + // + // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) + // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) + // -> + // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1 + // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32) + if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { + auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); + assert(Trunc && MRI.getType(TruncS32Src) == S32 && + "sgpr S1 must be result of G_TRUNC of sgpr S32"); B.setInstr(MI); + // Ensure that truncated bits in BoolSrc are 0. 
+ auto One = B.buildConstant({SgprRB, S32}, 1); + auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); + B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); + eraseInstr(MI, MRI); + } +} - if (DstTy == S32 && TruncSrcTy == S64) { - auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); - MRI.replaceRegWith(Dst, Unmerge.getReg(0)); - cleanUpAfterCombine(MI, Trunc); - return; - } +void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) { + // %Src:sgpr(S1) = G_TRUNC %TruncSrc + // %Dst = G_ANYEXT %Src:sgpr(S1) + // -> + // %Dst = G_... %TruncSrc + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (MRI.getType(Src) != S1) + return; + + auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); + if (!Trunc) + return; + + LLT DstTy = MRI.getType(Dst); + LLT TruncSrcTy = MRI.getType(TruncSrc); + + if (DstTy == TruncSrcTy) { + MRI.replaceRegWith(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; + } - if (DstTy == S64 && TruncSrcTy == S32) { - B.buildMergeLikeInstr(MI.getOperand(0).getReg(), - {TruncSrc, B.buildUndef({SgprRB, S32})}); - cleanUpAfterCombine(MI, Trunc); - return; - } + B.setInstr(MI); - if (DstTy == S32 && TruncSrcTy == S16) { - B.buildAnyExt(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + if (DstTy == S32 && TruncSrcTy == S64) { + auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); + MRI.replaceRegWith(Dst, Unmerge.getReg(0)); + eraseInstr(MI, MRI); + return; + } - if (DstTy == S16 && TruncSrcTy == S32) { - B.buildTrunc(Dst, TruncSrc); - cleanUpAfterCombine(MI, Trunc); - return; - } + if (DstTy == S64 && TruncSrcTy == S32) { + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), + {TruncSrc, B.buildUndef({SgprRB, S32})}); + eraseInstr(MI, MRI); + return; + } - llvm_unreachable("missing anyext + trunc combine"); + if (DstTy == S32 && TruncSrcTy == S16) { + B.buildAnyExt(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; } -}; + + if (DstTy == S16 && TruncSrcTy == S32) { + B.buildTrunc(Dst, TruncSrc); + eraseInstr(MI, MRI); + return; + } + + llvm_unreachable("missing anyext + trunc combine"); +} // Search through MRI for virtual registers with sgpr register bank and S1 LLT. [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 411159c..f471881 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -33,7 +33,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper( MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()), - MUI(MUI), RBI(RBI), RBLRules(RBLRules), + MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} @@ -56,6 +56,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +bool RegBankLegalizeHelper::executeInWaterfallLoop( + MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs) { + // Track use registers which have already been expanded with a readfirstlane + // sequence. This may have multiple uses if moving a sequence. 
+ DenseMap<Register, Register> WaterfalledRegMap; + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction &MF = B.getMF(); + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg; + if (IsWave32) { + MovExecOpc = AMDGPU::S_MOV_B32; + MovExecTermOpc = AMDGPU::S_MOV_B32_term; + XorTermOpc = AMDGPU::S_XOR_B32_term; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; + ExecReg = AMDGPU::EXEC_LO; + } else { + MovExecOpc = AMDGPU::S_MOV_B64; + MovExecTermOpc = AMDGPU::S_MOV_B64_term; + XorTermOpc = AMDGPU::S_XOR_B64_term; + AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; + ExecReg = AMDGPU::EXEC; + } + +#ifndef NDEBUG + const int OrigRangeSize = std::distance(Range.begin(), Range.end()); +#endif + + MachineRegisterInfo &MRI = *B.getMRI(); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); + + // Don't bother using generic instructions/registers for the exec mask. + B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg); + + Register SavedExec = MRI.createVirtualRegister(WaveRC); + + // To insert the loop we need to split the block. Move everything before + // this point to a new block, and insert a new empty block before this + // instruction. + MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + MF.insert(MBBI, LoopBB); + MF.insert(MBBI, BodyBB); + MF.insert(MBBI, RestoreExecBB); + MF.insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(RestoreExecBB); + BodyBB->addSuccessor(LoopBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); + + MBB.addSuccessor(LoopBB); + RestoreExecBB->addSuccessor(RemainderBB); + + B.setInsertPt(*LoopBB, LoopBB->end()); + + // +-MBB:------------+ + // | ... | + // | %0 = G_INST_1 | + // | %Dst = MI %Vgpr | + // | %1 = G_INST_2 | + // | ... | + // +-----------------+ + // -> + // +-MBB-------------------------------+ + // | ... | + // | %0 = G_INST_1 | + // | %SaveExecReg = S_MOV_B32 $exec_lo | + // +----------------|------------------+ + // | /------------------------------| + // V V | + // +-LoopBB---------------------------------------------------------------+ | + // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | | + // | instead of executing for each lane, see if other lanes had | | + // | same value for %Vgpr and execute for them also. 
| | + // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | | + // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | | + // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | | + // | exec is active for lanes with the same "CurrentLane value" in Vgpr | | + // +----------------|-----------------------------------------------------+ | + // V | + // +-BodyBB------------------------------------------------------------+ | + // | %Dst = MI %CurrentLaneReg:sgpr(s32) | | + // | executed only for active lanes and written to Dst | | + // | $exec = S_XOR_B32 $exec, %SavedExec | | + // | set active lanes to 0 in SavedExec, lanes that did not write to | | + // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | | + // | SI_WATERFALL_LOOP LoopBB |-----| + // +----------------|--------------------------------------------------+ + // V + // +-RestoreExecBB--------------------------+ + // | $exec_lo = S_MOV_B32_term %SaveExecReg | + // +----------------|-----------------------+ + // V + // +-RemainderBB:----------------------+ + // | %1 = G_INST_2 | + // | ... | + // +---------------------------------- + + + // Move the instruction into the loop body. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); + + // Figure out the iterator range after splicing the instructions. + MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator(); + auto NewEnd = BodyBB->end(); + assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); + + B.setMBB(*LoopBB); + Register CondReg; + + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.all_uses()) { + Register OldReg = Op.getReg(); + if (!SGPROperandRegs.count(OldReg)) + continue; + + // See if we already processed this register in another instruction in + // the sequence. + auto OldVal = WaterfalledRegMap.find(OldReg); + if (OldVal != WaterfalledRegMap.end()) { + Op.setReg(OldVal->second); + continue; + } + + Register OpReg = Op.getReg(); + LLT OpTy = MRI.getType(OpReg); + + // TODO: support for agpr + assert(MRI.getRegBank(OpReg) == VgprRB); + Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy}); + buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI); + + // Build the comparison(s), CurrentLaneReg == OpReg. + unsigned OpSize = OpTy.getSizeInBits(); + unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32; + LLT PartTy = LLT::scalar(PartSize); + unsigned NumParts = OpSize / PartSize; + SmallVector<Register, 8> OpParts; + SmallVector<Register, 8> CurrentLaneParts; + + if (NumParts == 1) { + OpParts.push_back(OpReg); + CurrentLaneParts.push_back(CurrentLaneReg); + } else { + auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg); + auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg); + for (unsigned i = 0; i < NumParts; ++i) { + OpParts.push_back(UnmergeOp.getReg(i)); + CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i)); + } + } + + for (unsigned i = 0; i < NumParts; ++i) { + Register CmpReg = MRI.createVirtualRegister(VccRB_S1); + B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]); + + if (!CondReg) + CondReg = CmpReg; + else + CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0); + } + + Op.setReg(CurrentLaneReg); + + // Make sure we don't re-process this register again. 
+ WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg())); + } + } + + // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection. + Register CondRegLM = + MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)}); + B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg); + + // Update EXEC, save the original EXEC value to SavedExec. + B.buildInstr(AndSaveExecOpc) + .addDef(SavedExec) + .addReg(CondRegLM, RegState::Kill); + MRI.setSimpleHint(SavedExec, CondRegLM); + + B.setInsertPt(*BodyBB, BodyBB->end()); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. + B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec); + + // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use + // s_cbranch_scc0? + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. + B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); + + // Save the EXEC mask before the loop. + B.setInsertPt(MBB, MBB.end()); + B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg); + + // Restore the EXEC mask after the loop. + B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin()); + B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg); + + // Set the insert point after the original instruction, so any new + // instructions will be in the remainder. + B.setInsertPt(*RemainderBB, RemainderBB->begin()); + + return true; +} + void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy) { MachineFunction &MF = B.getMF(); @@ -391,7 +609,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, switch (Mapping.LoweringMethod) { case DoNotLower: - return; + break; case VccExtToSel: return lowerVccExtToSel(MI); case UniExtToSel: { @@ -527,7 +745,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, } } - // TODO: executeInWaterfallLoop(... WaterfallSgprs) + if (!WaterfallSgprs.empty()) { + MachineBasicBlock::iterator I = MI.getIterator(); + executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs); + } } LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { @@ -539,6 +760,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Vgpr16: return LLT::scalar(16); case Sgpr32: + case Sgpr32_WF: case Sgpr32Trunc: case Sgpr32AExt: case Sgpr32AExtBoolInReg: @@ -577,6 +799,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case VgprV2S32: return LLT::fixed_vector(2, 32); case SgprV4S32: + case SgprV4S32_WF: case VgprV4S32: case UniInVgprV4S32: return LLT::fixed_vector(4, 32); @@ -650,6 +873,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { return VccRB; case Sgpr16: case Sgpr32: + case Sgpr32_WF: case Sgpr64: case Sgpr128: case SgprP1: @@ -662,6 +886,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprV2S16: case SgprV2S32: case SgprV4S32: + case SgprV4S32_WF: case SgprB32: case SgprB64: case SgprB96: @@ -923,6 +1148,14 @@ void RegBankLegalizeHelper::applyMappingSrc( } break; } + // sgpr waterfall, scalars and vectors + case Sgpr32_WF: + case SgprV4S32_WF: { + assert(Ty == getTyFromID(MethodIDs[i])); + if (RB != SgprRB) + SgprWaterfallOperandRegs.insert(Reg); + break; + } // sgpr and vgpr scalars with extend case Sgpr32AExt: { // Note: this ext allows S1, and it is meant to be combined away. 
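(Editor's aside, not part of the patch.) The AMDGPURegBankLegalizeHelper.cpp hunks above build the waterfall loop sketched in the ASCII diagram: read the first active lane's value with V_READFIRSTLANE, ballot the lanes that hold the same value, run the instruction once for exactly those lanes, then XOR them out of EXEC and repeat until EXEC is empty. As a reading aid only, here is a small host-side C++ model of that lane-grouping idea; WaveSize, the Vgpr array, and the printf are illustrative assumptions, and the real code of course builds MachineIR blocks rather than looping over arrays.

#include <array>
#include <bitset>
#include <cstdint>
#include <cstdio>

constexpr unsigned WaveSize = 32;

int main() {
  std::array<uint32_t, WaveSize> Vgpr{};  // per-lane value of the "divergent SGPR operand"
  for (unsigned L = 0; L < WaveSize; ++L)
    Vgpr[L] = L / 8;                      // four distinct values across the wave

  std::bitset<WaveSize> Exec;             // EXEC mask, all lanes active on entry
  Exec.set();
  std::bitset<WaveSize> SaveExec = Exec;  // plays the role of SaveExecReg

  while (Exec.any()) {
    // V_READFIRSTLANE: the first active lane's value becomes the uniform value.
    unsigned FirstLane = 0;
    while (!Exec[FirstLane])
      ++FirstLane;
    uint32_t CurrentLane = Vgpr[FirstLane];

    // ICMP + ballot: active lanes whose value matches form this trip's EXEC.
    std::bitset<WaveSize> Cond;
    for (unsigned L = 0; L < WaveSize; ++L)
      Cond[L] = Exec[L] && Vgpr[L] == CurrentLane;

    // "MI" runs once with the now-uniform operand, covering exactly these lanes.
    std::printf("iteration with value %u covers %zu lanes\n", CurrentLane, Cond.count());

    // S_XOR of the saved exec: retire the lanes that were just serviced.
    Exec &= ~Cond;
  }

  Exec = SaveExec;                        // restore EXEC after the loop
  return 0;
}

The point the model makes is that the trip count equals the number of distinct values in the wave, so a fully uniform operand costs a single iteration.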
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index 08cc7d4..db965d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -32,6 +32,7 @@ class RegBankLegalizeHelper { const MachineUniformityInfo &MUI; const RegisterBankInfo &RBI; const RegBankLegalizeRules &RBLRules; + const bool IsWave32; const RegisterBank *SgprRB; const RegisterBank *VgprRB; const RegisterBank *VccRB; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a60855c..5a6ad40 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -529,7 +529,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ICMP}) .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}); + .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); addRulesForGOpcs({G_FCMP}) .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}) @@ -666,11 +667,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // clang-format off addRulesForGOpcs({G_LOAD}) .Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}}) + .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}}) .Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}}) .Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}}) .Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}}) .Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}) + .Any({{{UniB64, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}}) + .Any({{{UniB96, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}}) + .Any({{{UniB128, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}}) .Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}}) .Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}}) @@ -684,6 +689,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads) .Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads) .Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads) + .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}}) .Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}}) .Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}}) .Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load @@ -698,11 +704,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}}); // clang-format on - addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector) - .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) - .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}); + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + 
.Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); addRulesForGOpcs({G_STORE}) .Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}}) @@ -716,7 +726,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_PTR_ADD}) .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}); + .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) + .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 7243d75..1391440 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -188,7 +188,11 @@ enum RegBankLLTMappingApplyID { Sgpr32Trunc, - // Src only modifiers: waterfalls, extends + // Src only modifiers: execute in waterfall loop if divergent + Sgpr32_WF, + SgprV4S32_WF, + + // Src only modifiers: extends Sgpr32AExt, Sgpr32AExtBoolInReg, Sgpr32SExt, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f1caf24..c5a1d9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Special case for s_mul_u64. There is not a vector equivalent of // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector // multiplications. - if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { + if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL && + DstTy.getSizeInBits() == 64) { applyMappingSMULU64(B, OpdMapper); return; } @@ -3500,19 +3501,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl( applyMappingMAD_64_32(B, OpdMapper); return; case AMDGPU::G_PREFETCH: { - if (!Subtarget.hasPrefetch() || !Subtarget.hasSafeSmemPrefetch()) { + if (!Subtarget.hasSafeSmemPrefetch() && !Subtarget.hasVmemPrefInsts()) { MI.eraseFromParent(); return; } Register PtrReg = MI.getOperand(0).getReg(); unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); - if (PtrBank == AMDGPU::VGPRRegBankID) { + if (PtrBank == AMDGPU::VGPRRegBankID && + (!Subtarget.hasVmemPrefInsts() || !MI.getOperand(3).getImm())) { + // Cannot do I$ prefetch with divergent pointer. 
MI.eraseFromParent(); return; } unsigned AS = MRI.getType(PtrReg).getAddressSpace(); - if (!AMDGPU::isFlatGlobalAddrSpace(AS) && - AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + if ((!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) || + (!Subtarget.hasSafeSmemPrefetch() && + (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + !MI.getOperand(3).getImm() /* I$ prefetch */))) { MI.eraseFromParent(); return; } @@ -3973,7 +3979,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; } else { - OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); + if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64()) + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + else + OpdsMapping[0] = + getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); @@ -5170,6 +5180,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -5432,6 +5448,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: + return getDefaultMappingVOP(MI); default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index a8e1967..f580f43 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -159,7 +159,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // If the inputs are tied and the same register, we can shortcut and // directly replace the register. 
- if (Src2->getReg() != CopySrcReg) { + if (!Src2->isReg() || Src2->getReg() != CopySrcReg || + Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { LLVM_DEBUG( dbgs() << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1e44be8..6878744 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -61,6 +61,7 @@ protected: bool EnableRealTrue16Insts = false; bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; + bool HasBF16PackedInsts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; bool HasDsSrc2Insts = false; @@ -209,6 +210,8 @@ public: return HasBF16ConversionInsts; } + bool hasBF16PackedInsts() const { return HasBF16PackedInsts; } + bool hasMadMixInsts() const { return HasMadMixInsts; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 679c55d..7207c25 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -13,6 +13,7 @@ let WantsRoot = true in { def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; + def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>; } @@ -464,6 +465,37 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64:$vaddr), string asm = " $vaddr"> : + FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { + let has_vdst = 0; + let has_data = 0; + let mayLoad = 1; + let mayStore = 1; + let VM_CNT = 0; + let LGKM_CNT = 0; +} + +multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { + def "" : FLAT_Prefetch_Pseudo<opName>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let enabled_saddr = 1; + } +} + +multiclass FLAT_Global_Prefetch_Pseudo<string opName> { + let is_flat_global = 1, has_saddr = 1 in { + def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64:$vaddr), " $vaddr, off">, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + GlobalSaddrTable<1, opName> { + let enabled_saddr = 1; + } + } +} + class FlatScratchInst <string sv_op, string mode> { string SVOp = sv_op; string Mode = mode; @@ -1162,6 +1194,16 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; +let SubtargetPredicate = isGFX125xOnly in { +defm FLAT_LOAD_MONITOR_B32 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b32", VGPR_32>; +defm FLAT_LOAD_MONITOR_B64 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b64", VReg_64>; +defm FLAT_LOAD_MONITOR_B128 : FLAT_Flat_Load_Pseudo <"flat_load_monitor_b128", VReg_128>; + +defm GLOBAL_LOAD_MONITOR_B32 : FLAT_Global_Load_Pseudo <"global_load_monitor_b32", VGPR_32>; +defm 
GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b64", VReg_64>; +defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = isGFX12Plus in { let Uses = [EXEC, M0] in { defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>; @@ -1218,6 +1260,11 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in "global_atomic_pk_add_f16", VGPR_32, v2f16 >; +let SubtargetPredicate = HasVmemPrefInsts in { + defm FLAT_PREFETCH_B8 : FLAT_Flat_Prefetch_Pseudo<"flat_prefetch_b8">; + defm GLOBAL_PREFETCH_B8 : FLAT_Global_Prefetch_Pseudo<"global_prefetch_b8">; +} + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -1228,6 +1275,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN (inst $vaddr, $offset) >; +class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) @@ -1249,8 +1301,8 @@ class FlatSignedLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Value >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, 0, $in) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1264,8 +1316,8 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT >; class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, (i32 0)) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1278,6 +1330,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $saddr, $voffset, $offset, $cpol) >; +class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))), + (inst $vaddr, $offset, $cpol) +>; + +class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), + (inst $saddr, $voffset, $offset, $cpol) +>; + class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), @@ -1459,8 +1521,8 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo 
inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), - (inst $vaddr, $saddr, $offset, 0) + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (inst $vaddr, $saddr, $offset, $cpol) >; multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { @@ -1473,6 +1535,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp } } +multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadSignedPat_CPOL<inst, node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : FlatSignedLoadPat_D16 <inst, node, vt> { let AddedComplexity = 10; @@ -2009,6 +2081,16 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; } +let OtherPredicates = [isGFX125xOnly] in { + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; + def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; + defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; +} // End SubtargetPredicate = isGFX125xOnly + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -2138,6 +2220,77 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f } // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +def PrefetchLoc: SDNodeXForm<timm, [{ + uint32_t V = N->getZExtValue(); + V = (AMDGPU::CPol::SCOPE_MASK - (V & AMDGPU::CPol::SCOPE_MASK)) << AMDGPU::CPol::SCOPE_SHIFT; + if (!Subtarget->hasSafeCUPrefetch()) + V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe + return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32); +}]>; + +def prefetch_flat : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; + }]; +} + +def prefetch_global : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); }]> { + let GISelPredicateCode = [{ + return (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::GLOBAL_ADDRESS || + ((*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !Subtarget->hasSafeSmemPrefetch()); + 
}]; +} + +multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatternOperator rw> { + def : GCNPat < + (prefetch_kind (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 0, 25); + } + + def : GCNPat < + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) + > { + let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); + } +} + +multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { + def : GCNPat < + (intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol) + >; + + def : GCNPat < + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { + let AddedComplexity = 11; + } +} + +let SubtargetPredicate = HasVmemPrefInsts in { + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_zero>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_zero>; + + // Patterns for forced vector prefetch with rw = 1. + defm : FlatPrefetchPats<"FLAT_PREFETCH_B8", prefetch_flat, i32imm_one>; + defm : FlatPrefetchPats<"GLOBAL_PREFETCH_B8", prefetch_global, i32imm_one>; + + + // Patterns for target intrinsics + defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>; + defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>; +} // End SubtargetPredicate = HasVmemPrefInsts + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -3210,6 +3363,17 @@ multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME defm TENSOR_SAVE : VFLAT_Real_gfx1250<0x06e>; defm TENSOR_STOP : VFLAT_Real_gfx1250<0x06f>; +defm FLAT_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; +defm GLOBAL_PREFETCH_B8 : VFLAT_Real_AllAddr_gfx1250<0x05d>; + +defm FLAT_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm FLAT_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm FLAT_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + +defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; +defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; +defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7d6723a..334afd3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI) { - return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR); + return STI->isSGPRClass(RC) + ? SGPR + : (STI->isAGPRClass(RC) + ? AGPR + : (STI->isVectorSuperClass(RC) ? 
AVGPR : VGPR)); } void GCNRegPressure::inc(unsigned Reg, diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 3749b6d..ea33a22 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -29,43 +29,57 @@ class raw_ostream; class SlotIndex; struct GCNRegPressure { - enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS }; + enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; GCNRegPressure() { clear(); } - bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; } + bool empty() const { + return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; + } void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } - /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p - /// UnifiedVGPRFile + /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure + /// dependent upon \p UnifiedVGPRFile unsigned getVGPRNum(bool UnifiedVGPRFile) const { if (UnifiedVGPRFile) { - return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR]) - : Value[VGPR]; + return Value[AGPR] + ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR]) + : Value[VGPR] + Value[AVGPR]; } - return std::max(Value[VGPR], Value[AGPR]); + // AVGPR assignment priority is based on the width of the register. Account + // AVGPR pressure as VGPR. + return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]); } /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs - /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file. + /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified + /// VGPR file. inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs, - unsigned NumAGPRs) { - return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + + unsigned NumAGPRs, + unsigned NumAVGPRs) { + + // Assume AVGPRs will be assigned as VGPRs. + return alignTo(NumArchVGPRs + NumAVGPRs, + AMDGPU::IsaInfo::getArchVGPRAllocGranule()) + NumAGPRs; } - /// \returns the ArchVGPR32 pressure - unsigned getArchVGPRNum() const { return Value[VGPR]; } + /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be + /// allocated as VGPR + unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; } /// \returns the AccVGPR32 pressure unsigned getAGPRNum() const { return Value[AGPR]; } + /// \returns the AVGPR32 pressure + unsigned getAVGPRNum() const { return Value[AVGPR]; } unsigned getVGPRTuplesWeight() const { - return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]); + return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR], + Value[TOTAL_KINDS + AGPR]); } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index a655308..ce1ce68 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() { for (auto &[DefMI, Remat] : Rematerializations) { MachineBasicBlock::iterator InsertPos(Remat.UseMI); Register Reg = DefMI->getOperand(0).getReg(); - unsigned SubReg = DefMI->getOperand(0).getSubReg(); unsigned DefRegion = MIRegion.at(DefMI); // Rematerialize DefMI to its use block. 
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI, - *DAG.TRI); + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); Remat.RematMI = &*std::prev(InsertPos); - Remat.RematMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); // Update region boundaries in regions we sinked from (remove defining MI) @@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() { MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); MachineBasicBlock *MBB = RegionBB[DefRegion]; Register Reg = RematMI.getOperand(0).getReg(); - unsigned SubReg = RematMI.getOperand(0).getSubReg(); // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, + *DAG.TRI); MachineInstr *NewMI = &*std::prev(InsertPos); - NewMI->getOperand(0).setSubReg(SubReg); DAG.LIS->InsertMachineInstrInMaps(*NewMI); auto UseRegion = MIRegion.find(Remat.UseMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8b758b0..88a269f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -123,6 +123,7 @@ protected: bool HasSMemRealTime = false; bool HasIntClamp = false; bool HasFmaMixInsts = false; + bool HasFmaMixBF16Insts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; bool HasScalarDwordx3Loads = false; @@ -244,7 +245,9 @@ protected: bool HasVMEMtoScalarWriteHazard = false; bool HasSMEMtoVectorWriteHazard = false; bool HasInstFwdPrefetchBug = false; + bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; + bool HasSafeCUPrefetch = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -265,8 +268,10 @@ protected: bool HasIEEEMinimumMaximumInsts = false; bool HasMinimum3Maximum3F32 = false; bool HasMinimum3Maximum3F16 = false; + bool HasMin3Max3PKF16 = false; bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; + bool HasAddSubU64Insts = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -460,6 +465,8 @@ public: return HasFmaMixInsts; } + bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasCARRY() const { return true; } @@ -985,8 +992,12 @@ public: bool hasPrefetch() const { return GFX12Insts; } + bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } + bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } + bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1306,7 +1317,7 @@ public: bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } - bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; } + bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } /// Return if operations acting on VGPR tuples require even alignment. 
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1387,6 +1398,8 @@ public: return HasMinimum3Maximum3F16; } + bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; } + bool hasTanhInsts() const { return HasTanhInsts; } bool hasAddPC64Inst() const { return GFX1250Insts; } @@ -1500,6 +1513,12 @@ public: bool hasVOPD3() const { return GFX1250Insts; } + // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. + bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. + bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 429ce0e0..a33dbfa 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -270,5 +270,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); } } + finalizeBundles(MF); return false; } diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 2a3b42e..eff5b0a 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -138,7 +138,6 @@ void R600PassConfig::addPreSched2() { void R600PassConfig::addPreEmitPass() { addPass(createR600MachineCFGStructurizerPass()); addPass(createR600ExpandSpecialInstrsPass()); - addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer()); addPass(createR600ControlFlowFinalizer()); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 3902d4c..40b8bcd 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -392,11 +392,13 @@ enum CPol { TH_ATOMIC_CASCADE = 4, // Cascading vs regular // Scope - SCOPE = 0x3 << 3, // All Scope bits - SCOPE_CU = 0 << 3, - SCOPE_SE = 1 << 3, - SCOPE_DEV = 2 << 3, - SCOPE_SYS = 3 << 3, + SCOPE_SHIFT = 3, + SCOPE_MASK = 0x3, + SCOPE = SCOPE_MASK << SCOPE_SHIFT, // All Scope bits + SCOPE_CU = 0 << SCOPE_SHIFT, + SCOPE_SE = 1 << SCOPE_SHIFT, + SCOPE_DEV = 2 << SCOPE_SHIFT, + SCOPE_SYS = 3 << SCOPE_SHIFT, NV = 1 << 5, // Non-volatile bit diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e5d1eaa..b77da4d 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat( switch (OpTy) { case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0); break; case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1); break; default: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d65c3ae..8d51ec6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -874,13 +874,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); - if 
(Subtarget->hasScalarSMulU64()) + if (Subtarget->hasVectorMulU64()) + setOperationAction(ISD::MUL, MVT::i64, Legal); + else if (Subtarget->hasScalarSMulU64()) setOperationAction(ISD::MUL, MVT::i64, Custom); if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch()) + if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts()) setOperationAction(ISD::PREFETCH, MVT::Other, Custom); if (Subtarget->hasIEEEMinimumMaximumInsts()) { @@ -944,6 +946,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16PackedInsts()) { + setOperationAction( + {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA}, + MVT::v2bf16, Legal); + } + if (Subtarget->hasBF16TransInsts()) { setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); } @@ -1053,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? denormalModeIsFlushAllF32(DAG.getMachineFunction()); } @@ -1467,6 +1477,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr8_b64: @@ -1540,7 +1556,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_s_prefetch_data: { + case Intrinsic::amdgcn_s_prefetch_data: + case Intrinsic::amdgcn_flat_prefetch: + case Intrinsic::amdgcn_global_prefetch: { Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); @@ -1591,10 +1609,16 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case 
Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: @@ -4432,19 +4456,28 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, } SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { - if (Op->isDivergent()) + if (Op->isDivergent() && + (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4))) + // Cannot do I$ prefetch with divergent pointer. return SDValue(); switch (cast<MemSDNode>(Op)->getAddressSpace()) { case AMDGPUAS::FLAT_ADDRESS: case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS_32BIT: break; + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + if (Subtarget->hasSafeSmemPrefetch()) + break; + [[fallthrough]]; default: return SDValue(); } + // I$ prefetch + if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4)) + return SDValue(); + return Op; } @@ -5415,6 +5448,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); + if (ST.hasAddSubU64Insts()) { + auto I = BuildMI(*BB, MI, DL, + TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 + : AMDGPU::V_SUB_U64_e64), + Dest.getReg()) + .add(Src0) + .add(Src1) + .addImm(0); // clamp + TII->legalizeOperands(*I); + MI.eraseFromParent(); + return BB; + } + if (IsAdd && ST.hasLshlAddU64Inst()) { auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), Dest.getReg()) @@ -14047,7 +14093,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, case ISD::FMAXIMUMNUM: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: - return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); + return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) || + (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16()); case ISD::FMINIMUM: case ISD::FMAXIMUM: return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || @@ -14132,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) || + (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) || (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 9faf497..dd3f2fe 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { assert(TII->isFLAT(MI)); - // All flat instructions use the VMEM counter. - assert(TII->usesVM_CNT(MI)); + // All flat instructions use the VMEM counter except prefetch. + if (!TII->usesVM_CNT(MI)) + return false; // If there are no memory operands then conservatively assume the flat // operation may access VMEM. @@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } - // A Flat memory operation must access at least one address space. 
- assert(FlatASCount); - // This is a flat memory operation that access both VMEM and LDS, so note it // - it will require that both the VM and LGKM be flushed to zero if it is // pending when a VM or LGKM dependency occurs. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 571f3ef..8d6c1d0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(DstHi); } break; + + case AMDGPU::V_MAX_BF16_PSEUDO_e64: + assert(ST.hasBF16PackedInsts()); + MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); + MI.addOperand(MachineOperand::CreateImm(0)); // op_sel + MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo + MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi + auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1); + auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1); + break; } + return true; } @@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI, } bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, - const MachineOperand *MO0, unsigned OpIdx1, - const MachineOperand *MO1) const { + unsigned OpIdx1) const { const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0]; const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1]; - const TargetRegisterClass *DefinedRC1 = - OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr; - const TargetRegisterClass *DefinedRC0 = - OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr; unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + const MachineOperand &MO0 = MI.getOperand(OpIdx0); + const MachineOperand &MO1 = MI.getOperand(OpIdx1); + // Swap doesn't breach constant bus or literal limits // It may move literal to position other than src0, this is not allowed // pre-gfx10 However, most test cases need literals in Src0 for VOP // FIXME: After gfx9, literal can be in place other than Src0 if (isVALU(MI)) { - if ((int)OpIdx0 == Src0Idx && !MO0->isReg() && - !isInlineConstant(*MO0, OpInfo1)) + if ((int)OpIdx0 == Src0Idx && !MO0.isReg() && + !isInlineConstant(MO0, OpInfo1)) return false; - if ((int)OpIdx1 == Src0Idx && !MO1->isReg() && - !isInlineConstant(*MO1, OpInfo0)) + if ((int)OpIdx1 == Src0Idx && !MO1.isReg() && + !isInlineConstant(MO1, OpInfo0)) return false; } - if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { - if (!DefinedRC1) + if ((int)OpIdx1 != Src0Idx && MO0.isReg()) { + if (OpInfo1.RegClass == -1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0) && - (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); + return isLegalRegOperand(MI, OpIdx1, MO0) && + (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1)); } - if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { - if (!DefinedRC0) + if ((int)OpIdx0 != Src0Idx && MO1.isReg()) { + if (OpInfo0.RegClass == -1) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && - isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) && + isLegalRegOperand(MI, OpIdx0, MO1); } // No need to check 64-bit literals since swapping does not bring new // 64-bit literals into current instruction to fold to 32-bit 
- return isImmOperandLegal(MI, OpIdx1, *MO0); + return isImmOperandLegal(MI, OpIdx1, MO0); } MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, static_cast<int>(Src1Idx) && "inconsistency with findCommutedOpIndices"); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - MachineOperand &Src1 = MI.getOperand(Src1Idx); - if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) { + if (!isLegalToSwap(MI, Src0Idx, Src1Idx)) return nullptr; - } + MachineInstr *CommutedMI = nullptr; + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src0.isReg() && Src1.isReg()) { // Be sure to copy the source modifiers to the right place. CommutedMI = @@ -7361,6 +7372,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MUL_U64: + if (ST.hasVectorMulU64()) { + NewOpcode = AMDGPU::V_MUL_U64_e64; + break; + } // Split s_mul_u64 in 32-bit vector multiplications. splitScalarSMulU64(Worklist, Inst, MDT); Inst.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 800ea9a..2ffb783 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -197,8 +197,7 @@ protected: AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const; bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, - const MachineOperand *fromMO, unsigned toIdx, - const MachineOperand *toMO) const; + unsigned toIdx) const; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index bd4995b..83b0490 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">; +def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">; def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; @@ -2863,9 +2865,11 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>; +def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; @@ -2873,10 +2877,12 @@ def VOP_I16_I32 : VOPProfile <[i16, i32, untyped, untyped]>; def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; +def VOP_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, untyped]>; def VOP_B32_F16_F16 : 
VOPProfile <[i32, f16, f16, untyped]>; def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; +def VOP_V2BF16_V2BF16_V2BF16_V2BF16 : VOPProfile <[v2bf16, v2bf16, v2bf16, v2bf16]>; def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>; def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; @@ -2912,8 +2918,10 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>; def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>; +def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; +def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d05be8f..54fa192 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in def : ClampPat<V_MAX_F16_t16_e64, f16>; let SubtargetPredicate = UseFakeTrue16Insts in def : ClampPat<V_MAX_F16_fake16_e64, f16>; +// FIXME-TRUE16: Pseudo expansion of this won't work with True16. +let True16Predicate = UseFakeTrue16Insts in +def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1903,6 +1906,13 @@ def : GCNPat < >; } +let SubtargetPredicate = HasBF16PackedInsts in { +def : GCNPat < + (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))), + (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0, + $src0_modifiers, $src0, DSTCLAMP.ENABLE) +>; +} // End SubtargetPredicate = HasBF16PackedInsts /********** ================================ **********/ /********** Floating point absolute/negative **********/ diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index f0be204..9a1448f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm; - if (!MFMAVGPRForm && ST.hasGFX90AInsts() && - ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && - !mayUseAGPRs(F)) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + MayNeedAGPRs = ST.hasMAIInsts(); + if (ST.hasGFX90AInsts()) { + // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection + // should be separated from availability of AGPRs + if (MFMAVGPRForm || + (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && + !mayUseAGPRs(F))) + MayNeedAGPRs = false; // We will select all MAI with VGPR operands. 
+ } if (AMDGPU::isChainCC(CC)) { // Chain functions don't receive an SP from their caller, but are free to diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060..0e8a420 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -704,16 +704,16 @@ void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning)); } -/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. -/// If this tag isn't present, or if it has no meaningful values, returns \p -/// Default. Otherwise returns all the address spaces concerned by the MMRA. -static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, - SIAtomicAddrSpace Default) { - static constexpr StringLiteral FenceASPrefix = "amdgpu-as"; +/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA. +/// If this tag isn't present, or if it has no meaningful values, returns +/// \p none, otherwise returns the address spaces specified by the MD. +static std::optional<SIAtomicAddrSpace> +getSynchronizeAddrSpaceMD(const MachineInstr &MI) { + static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as"; auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); if (!MMRA) - return Default; + return std::nullopt; SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; for (const auto &[Prefix, Suffix] : MMRA) { @@ -726,7 +726,10 @@ static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, diagnoseUnknownMMRAASName(MI, Suffix); } - return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; + if (Result == SIAtomicAddrSpace::NONE) + return std::nullopt; + + return Result; } } // end anonymous namespace @@ -903,12 +906,19 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = *ScopeOrNone; - if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || - ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) { + // We currently expect refineOrderingAS to be the only place that + // can refine the AS ordered by the fence. + // If that changes, we need to review the semantics of that function + // in case it needs to preserve certain address spaces. reportUnsupported(MI, "Unsupported atomic address space"); return std::nullopt; } + auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI); + if (SynchronizeAS) + OrderingAddrSpace = *SynchronizeAS; + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); } @@ -2687,11 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, AtomicPseudoMIs.push_back(MI); bool Changed = false; - // Refine fenced address space based on MMRAs. - // - // TODO: Should we support this MMRA on other atomic operations? 
- auto OrderingAddrSpace = - getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); + const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace(); if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 0039d2f..218841d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> let TSFlags{2} = HasVGPR; let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; + + // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) + // to decide which registers to try to assign first. Usually, this RegisterClass priority is given + // very high priority, if not the highest priority, when considering which VirtReg to allocate next. + // + // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to + // assign more constrained RegisterClasses first. As a result, we prioritize register classes with + // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32). + // + // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs. + // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained + // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the + // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor + // is used for scaling of the bit (i.e. 1 << 4). + field int BaseClassPriority = 1; + field int BaseClassScaleFactor = 16; + } multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, @@ -575,7 +592,7 @@ let HasVGPR = 1 in { def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 255), (sequence "VGPR%u_HI16", 0, 255)))> { - let AllocationPriority = 2; + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; @@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types // Identical to VGPR_32 except it only contains the low 128 (Lo128) registers. 
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 127))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = !sub(numRegs, 1); + + // Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the + // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result + // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for + // registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one + // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, + // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. + defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); + + let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor)); let Weight = numRegs; } // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let HasVGPR = 1 in { + let HasVGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, regList> { let BaseClassOrder = !mul(numRegs, 32); @@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; + let BaseClassPriority = 0; let Size = 32; } } // End GeneratePressureSet = 0 @@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 // aligned base register. multiclass AVRegClass<int numRegs, list<ValueType> regTypes, dag vregList, dag aregList> { - let HasVGPR = 1, HasAGPR = 1 in { + let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in { // Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 38cc51b..4bda51d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -856,9 +856,9 @@ def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>; def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), (prefetch node:$ptr, node:$rw, node:$loc, node:$type), - [{ return !N->getOperand(1)->isDivergent();}]> { + [{ return !N->getOperand(1)->isDivergent() && Subtarget->hasSafeSmemPrefetch();}]> { let GISelPredicateCode = [{ - return isInstrUniform(MI); + return isInstrUniform(MI) && Subtarget->hasSafeSmemPrefetch(); }]; } @@ -1152,6 +1152,7 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> { } defm : SMPrefetchPat<"INST", i32imm_zero>; +let AddedComplexity = 12 in // Prefer scalar prefetch over global for r/o case. defm : SMPrefetchPat<"DATA", i32imm_one>; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 030a6e1..550ec9d 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -925,6 +925,17 @@ let isAdd = 1 in { defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">; } +let isReMaterializable = 1 in { +let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in { +defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>; +// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable. +let isCommutable = 0 in +defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>; +} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in +defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag<mul>>; +} // End isReMaterializable = 1 + } // End isCommutable = 1 // These are special and do not read the exec mask. @@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName, VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>, VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>; +multiclass VOP2_Real_NO_DPP<GFXGen Gen, bits<6> op> : + VOP2_Real_e32<Gen, op>, VOP2_Real_e64<Gen, op>; + multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName, string asmName> { defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>, @@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>; defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>; defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>; +defm V_ADD_U64 : VOP2_Real_FULL<GFX1250Gen, 0x28>; +defm V_SUB_U64 : VOP2_Real_FULL<GFX1250Gen, 0x29>; +defm V_MUL_U64 : VOP2_Real_NO_DPP<GFX1250Gen, 0x2a>; //===----------------------------------------------------------------------===// // GFX11. 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index a259e5c..95fcd4a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> { bit UseTiedOutput = useTiedOutput; + defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret; + defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret; + defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret; + dag srcs = - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + (ins FP16InputMods:$src0_modifiers, Src0RC:$src0, + FP16InputMods:$src1_modifiers, Src1RC:$src1, + FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + FP16InputMods:$src2_modifiers, Src2RC:$src2); // FIXME: Clamp0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -144,48 +148,59 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>; def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>; } // End SubtargetPredicate = HasVOP3PInsts -let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in { +let isCommutable = 1, FPDPRounding = 1 in { +let SubtargetPredicate = HasMin3Max3PKF16 in { +defm V_PK_MIN3_NUM_F16 : VOP3PInst<"v_pk_min3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmin3>; +defm V_PK_MAX3_NUM_F16 : VOP3PInst<"v_pk_max3_num_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmax3>; +} + +let SubtargetPredicate = HasMinimum3Maximum3PKF16 in { defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>; defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>; } +} // End isCommutable = 1, FPDPRounding = 1 // TODO: Make sure we're doing the right thing with denormals. Note // that FMA and MAD will differ. multiclass MadFmaMixPats<SDPatternOperator fma_like, Instruction mix_inst, Instruction mixlo_inst, - Instruction mixhi_inst> { + Instruction mixhi_inst, + ValueType VT = f16, + ValueType vecVT = v2f16> { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? 
def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < (AMDGPUclamp (build_vector - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), - (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0, $hi_src1_modifiers, $hi_src1, $hi_src2_modifiers, $hi_src2, DSTCLAMP.ENABLE, @@ -197,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), @@ -207,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), DSTCLAMP.NONE, @@ -217,9 +232,9 @@ 
multiclass MadFmaMixPats<SDPatternOperator fma_like, >; def : GCNPat < - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, @@ -234,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = p in { def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, @@ -246,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, @@ -261,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, let True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1), - (v2f16 (mixlo_inst $src0_modifiers, $src0, + (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1), + (vecVT (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16))) + (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16))) >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, 
$elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; } // end True16Predicate } @@ -353,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; } +let SubtargetPredicate = HasFmaMixBF16Insts in { +let isCommutable = 1 in { + +let isReMaterializable = 1 in +defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>; + +let FPDPRounding = 1 in { +defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; + +let ClampLo = 0, ClampHi = 1 in { +defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>; +} +} // End FPDPRounding = 1 +} // End isCommutable = 1 + +defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>; +} // End SubtargetPredicate = HasFmaMixBF16Insts + def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { let HasModifiers = 0; } @@ -1196,6 +1229,20 @@ let isCommutable = 1, isReMaterializable = 1 in { let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; + + let SubtargetPredicate = HasBF16PackedInsts in { + defm V_PK_ADD_BF16 : VOP3PInst<"v_pk_add_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fadd>; + defm V_PK_MUL_BF16 : VOP3PInst<"v_pk_mul_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fmul>; + defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>; + defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>; + defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>; + + // Scalar pseudo used to emulate AMDGPUClamp. + // Expanded to V_PK_MAX_NUM_BF16 with unused high half. + // FIXME-TRUE16: Pseudo expansion of this won't work with True16. 
+ let True16Predicate = UseFakeTrue16Insts in + defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>; + } } // End isCommutable = 1, isReMaterializable = 1 def : AMDGPUMnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; @@ -2210,6 +2257,10 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op, defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; +defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>; +defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>; +defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>; + defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>; defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>; defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>; @@ -2218,6 +2269,22 @@ defm V_PK_MAX3_I16 : VOP3P_Real_gfx1250<0x2f>; defm V_PK_MAX3_U16 : VOP3P_Real_gfx1250<0x30>; defm V_PK_MIN3_I16 : VOP3P_Real_gfx1250<0x31>; defm V_PK_MIN3_U16 : VOP3P_Real_gfx1250<0x32>; +defm V_PK_FMA_BF16 : VOP3P_Real_gfx1250<0x11>; +defm V_PK_ADD_BF16 : VOP3P_Real_gfx1250<0x23>; +defm V_PK_MUL_BF16 : VOP3P_Real_gfx1250<0x2a>; +defm V_PK_MIN_NUM_BF16 : VOP3P_Real_gfx1250<0x2b>; +defm V_PK_MAX_NUM_BF16 : VOP3P_Real_gfx1250<0x2c>; +defm V_PK_MINIMUM3_F16 : VOP3P_Real_gfx1250<0x36>; +defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; +defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; +defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>; + +defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; +defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; +defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; + +let AssemblerPredicate = isGFX1250Plus in +def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8b7f06a..fca5dff 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -20347,6 +20347,13 @@ ARMTargetLowering::getSingleConstraintMatchWeight( return weight; } +static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) { + if (PR == 0 || VT == MVT::Other) + return false; + return (ARM::SPRRegClass.contains(PR) && VT != MVT::f32 && VT != MVT::i32) || + (ARM::DPRRegClass.contains(PR) && VT != MVT::f64); +} + using RCPair = std::pair<unsigned, const TargetRegisterClass *>; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( @@ -20420,7 +20427,10 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( if (StringRef("{cc}").equals_insensitive(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + if (isIncompatibleReg(RCP.first, VT)) + return {0, nullptr}; + return RCP; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ec6f4e2..ece6c10 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -12327,7 +12327,7 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) { } assert(Section && "must have section to emit alignment"); - if (Section->useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(*Section)) getStreamer().emitCodeAlignment(Align(2), 
&getSTI()); else getStreamer().emitValueToAlignment(Align(2)); @@ -12525,7 +12525,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) { // '.align' is target specifically handled to mean 2**2 byte alignment. const MCSection *Section = getStreamer().getCurrentSectionOnly(); assert(Section && "must have section to emit alignment"); - if (Section->useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(*Section)) getStreamer().emitCodeAlignment(Align(4), &getSTI(), 0); else getStreamer().emitValueToAlignment(Align(4), 0, 1, 0); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index a7a9911..868556b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -708,8 +708,6 @@ private: void SwitchToExTabSection(const MCSymbol &FnStart); void SwitchToExIdxSection(const MCSymbol &FnStart); - void EmitFixup(const MCExpr *Expr, MCFixupKind Kind); - bool IsThumb; bool IsAndroid; @@ -1096,8 +1094,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { } void ARMTargetELFStreamer::annotateTLSDescriptorSequence( - const MCSymbolRefExpr *S) { - getStreamer().EmitFixup(S, FK_Data_4); + const MCSymbolRefExpr *Expr) { + getStreamer().addFixup(Expr, FK_Data_4); } void ARMTargetELFStreamer::emitCode16() { getStreamer().setIsThumb(true); } @@ -1140,7 +1138,8 @@ void ARMTargetELFStreamer::finish() { MCContext &Ctx = getContext(); auto &Asm = getStreamer().getAssembler(); if (any_of(Asm, [](const MCSection &Sec) { - return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE; + return static_cast<const MCSectionELF &>(Sec).getFlags() & + ELF::SHF_ARM_PURECODE; })) { auto *Text = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); @@ -1206,11 +1205,6 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { SectionKind::getData(), FnStart); } -void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { - MCFragment *Frag = getCurrentFragment(); - Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind)); -} - void ARMELFStreamer::EHReset() { ExTab = nullptr; FnStart = nullptr; diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index ad8aa571..0fb33cd 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -260,7 +260,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) { continue; } - auto *Section = cast<MCSectionELF>(TLOF.SectionForGlobal(&GO, TM)); + auto *Section = static_cast<MCSectionELF *>(TLOF.SectionForGlobal(&GO, TM)); if (Section->getName().starts_with(".data")) NeedsCopyData = true; else if (Section->getName().starts_with(".rodata") && SubTM->hasLPM()) diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp index c2c1bb4..0615ec9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp @@ -24,8 +24,6 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) { CalleeSaveStackSlotSize = 2; CommentString = ";"; SeparatorString = "$"; - PrivateGlobalPrefix = ".L"; - PrivateLabelPrefix = ".L"; UsesELFSectionDirectiveForBSS = true; SupportsDebugInformation = true; } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h index fab2713..1915fa8 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h +++ 
b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h @@ -14,7 +14,7 @@ #define LLVM_AVR_ASM_INFO_H #include "MCTargetDesc/AVRMCExpr.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" #include "llvm/MC/MCExpr.h" namespace llvm { @@ -22,7 +22,7 @@ namespace llvm { class Triple; /// Specifies the format of AVR assembly files. -class AVRMCAsmInfo : public MCAsmInfo { +class AVRMCAsmInfo : public MCAsmInfoELF { public: explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options); void printSpecifierExpr(raw_ostream &OS, diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index 5963976..6ec78d0 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -7,12 +7,10 @@ //===----------------------------------------------------------------------===// #include "AVRMCExpr.h" -#include "MCTargetDesc/AVRMCAsmInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCValue.h" namespace llvm { diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 1e29a0f..a87b9a2 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -1255,10 +1255,8 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) { FuncInfo.Label = FuncLabel; FuncInfo.TypeId = FuncTypeId; if (FuncLabel->isInSection()) { - MCSection &Section = FuncLabel->getSection(); - const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section); - assert(SectionELF && "Null section for Function Label"); - SecNameOff = addString(SectionELF->getName()); + auto &Sec = static_cast<const MCSectionELF &>(FuncLabel->getSection()); + SecNameOff = addString(Sec.getName()); } else { SecNameOff = addString(".text"); } diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 827e928..bb74f6a 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -54,11 +54,8 @@ unsigned BPFELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCSymbol &Sym = *A; if (Sym.isDefined()) { - MCSection &Section = Sym.getSection(); - const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section); - assert(SectionELF && "Null section for reloc symbol"); - - unsigned Flags = SectionELF->getFlags(); + auto &Section = static_cast<const MCSectionELF &>(Sym.getSection()); + unsigned Flags = Section.getFlags(); if (Sym.isTemporary()) { // .BTF.ext generates FK_Data_4 relocations for diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp index a0011e8..fa9007e 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp @@ -16,7 +16,6 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index 7b21684..63d6e6f 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -13,18 +13,19 @@ #ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H #define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include 
"llvm/MC/MCAsmInfoELF.h" #include "llvm/TargetParser/Triple.h" namespace llvm { -class BPFMCAsmInfo : public MCAsmInfo { +class BPFMCAsmInfo : public MCAsmInfoELF { public: explicit BPFMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) { if (TT.getArch() == Triple::bpfeb) IsLittleEndian = false; PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = "L"; WeakRefDirective = "\t.weak\t"; UsesELFSectionDirectiveForBSS = true; diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index d9d9b36..feecfc0 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -301,41 +301,53 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { } bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { - Value *PtrOperand = GEPI.getPointerOperand(); - Type *OrigGEPType = GEPI.getSourceElementType(); - Type *NewGEPType = OrigGEPType; + GEPOperator *GOp = cast<GEPOperator>(&GEPI); + Value *PtrOperand = GOp->getPointerOperand(); + Type *NewGEPType = GOp->getSourceElementType(); bool NeedsTransform = false; + // Unwrap GEP ConstantExprs to find the base operand and element type + while (auto *CE = dyn_cast<ConstantExpr>(PtrOperand)) { + if (auto *GEPCE = dyn_cast<GEPOperator>(CE)) { + GOp = GEPCE; + PtrOperand = GEPCE->getPointerOperand(); + NewGEPType = GEPCE->getSourceElementType(); + } else + break; + } + if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) { NewGEPType = NewGlobal->getValueType(); PtrOperand = NewGlobal; NeedsTransform = true; } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrOperand)) { Type *AllocatedType = Alloca->getAllocatedType(); - // Only transform if the allocated type is an array - if (AllocatedType != OrigGEPType && isa<ArrayType>(AllocatedType)) { + if (isa<ArrayType>(AllocatedType) && + AllocatedType != GOp->getResultElementType()) { NewGEPType = AllocatedType; NeedsTransform = true; } } - // Scalar geps should remain scalars geps. The dxil-flatten-arrays pass will - // convert these scalar geps into flattened array geps - if (!isa<ArrayType>(OrigGEPType)) - NewGEPType = OrigGEPType; - - // Note: We bail if this isn't a gep touched via alloca or global - // transformations if (!NeedsTransform) return false; - IRBuilder<> Builder(&GEPI); - SmallVector<Value *, MaxVecSize> Indices(GEPI.indices()); + // Keep scalar GEPs scalar; dxil-flatten-arrays will do flattening later + if (!isa<ArrayType>(GOp->getSourceElementType())) + NewGEPType = GOp->getSourceElementType(); + IRBuilder<> Builder(&GEPI); + SmallVector<Value *, MaxVecSize> Indices(GOp->indices()); Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices, - GEPI.getName(), GEPI.getNoWrapFlags()); - GEPI.replaceAllUsesWith(NewGEP); - GEPI.eraseFromParent(); + GOp->getName(), GOp->getNoWrapFlags()); + + GOp->replaceAllUsesWith(NewGEP); + + if (auto *CE = dyn_cast<ConstantExpr>(GOp)) + CE->destroyConstant(); + else if (auto *OldGEPI = dyn_cast<GetElementPtrInst>(GOp)) + OldGEPI->eraseFromParent(); + return true; } diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index f0e2e78..7e1436e 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -263,8 +263,13 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { // merge the byte offsets. 
Otherwise, this GEP is itself the root of a GEP // chain and we need to deterine the root array type if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) { - assert(GEPChainInfoMap.contains(PtrOpGEP) && - "Expected parent GEP to be visited before this GEP"); + + // If the parent GEP was not processed, then we do not want to process its + // descendants. This can happen if the GEP chain is for an unsupported type + // such as a struct -- we do not flatten structs nor GEP chains for structs + if (!GEPChainInfoMap.contains(PtrOpGEP)) + return false; + GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP]; Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType; Info.RootPointerOperand = PGEPInfo.RootPointerOperand; diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp index c73648f..3427968 100644 --- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp +++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp @@ -24,18 +24,19 @@ using namespace llvm; -static void legalizeFreeze(Instruction &I, +static bool legalizeFreeze(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *>) { auto *FI = dyn_cast<FreezeInst>(&I); if (!FI) - return; + return false; FI->replaceAllUsesWith(FI->getOperand(0)); ToRemove.push_back(FI); + return true; } -static void fixI8UseChain(Instruction &I, +static bool fixI8UseChain(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { @@ -74,19 +75,19 @@ static void fixI8UseChain(Instruction &I, if (Trunc->getDestTy()->isIntegerTy(8)) { ReplacedValues[Trunc] = Trunc->getOperand(0); ToRemove.push_back(Trunc); - return; + return true; } } if (auto *Store = dyn_cast<StoreInst>(&I)) { if (!Store->getValueOperand()->getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewStore = Builder.CreateStore(NewOperands[0], NewOperands[1]); ReplacedValues[Store] = NewStore; ToRemove.push_back(Store); - return; + return true; } if (auto *Load = dyn_cast<LoadInst>(&I); @@ -104,17 +105,17 @@ static void fixI8UseChain(Instruction &I, LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]); ReplacedValues[Load] = NewLoad; ToRemove.push_back(Load); - return; + return true; } if (auto *Load = dyn_cast<LoadInst>(&I); Load && isa<ConstantExpr>(Load->getPointerOperand())) { auto *CE = dyn_cast<ConstantExpr>(Load->getPointerOperand()); if (!(CE->getOpcode() == Instruction::GetElementPtr)) - return; + return false; auto *GEP = dyn_cast<GEPOperator>(CE); if (!GEP->getSourceElementType()->isIntegerTy(8)) - return; + return false; Type *ElementType = Load->getType(); ConstantInt *Offset = dyn_cast<ConstantInt>(GEP->getOperand(1)); @@ -143,12 +144,12 @@ static void fixI8UseChain(Instruction &I, ReplacedValues[Load] = NewLoad; Load->replaceAllUsesWith(NewLoad); ToRemove.push_back(Load); - return; + return true; } if (auto *BO = dyn_cast<BinaryOperator>(&I)) { if (!I.getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewInst = @@ -162,24 +163,24 @@ static void fixI8UseChain(Instruction &I, } ReplacedValues[BO] = NewInst; ToRemove.push_back(BO); - return; + return true; } if (auto *Sel = dyn_cast<SelectInst>(&I)) { if (!I.getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewInst = Builder.CreateSelect(Sel->getCondition(), NewOperands[1], NewOperands[2]); 
ReplacedValues[Sel] = NewInst; ToRemove.push_back(Sel); - return; + return true; } if (auto *Cmp = dyn_cast<CmpInst>(&I)) { if (!Cmp->getOperand(0)->getType()->isIntegerTy(8)) - return; + return false; SmallVector<Value *> NewOperands; ProcessOperands(NewOperands); Value *NewInst = @@ -187,18 +188,18 @@ static void fixI8UseChain(Instruction &I, Cmp->replaceAllUsesWith(NewInst); ReplacedValues[Cmp] = NewInst; ToRemove.push_back(Cmp); - return; + return true; } if (auto *Cast = dyn_cast<CastInst>(&I)) { if (!Cast->getSrcTy()->isIntegerTy(8)) - return; + return false; ToRemove.push_back(Cast); auto *Replacement = ReplacedValues[Cast->getOperand(0)]; if (Cast->getType() == Replacement->getType()) { Cast->replaceAllUsesWith(Replacement); - return; + return true; } Value *AdjustedCast = nullptr; @@ -213,7 +214,7 @@ static void fixI8UseChain(Instruction &I, if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { if (!GEP->getType()->isPointerTy() || !GEP->getSourceElementType()->isIntegerTy(8)) - return; + return false; Value *BasePtr = GEP->getPointerOperand(); if (ReplacedValues.count(BasePtr)) @@ -248,15 +249,17 @@ static void fixI8UseChain(Instruction &I, ReplacedValues[GEP] = NewGEP; GEP->replaceAllUsesWith(NewGEP); ToRemove.push_back(GEP); + return true; } + return false; } -static void upcastI8AllocasAndUses(Instruction &I, +static bool upcastI8AllocasAndUses(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { auto *AI = dyn_cast<AllocaInst>(&I); if (!AI || !AI->getAllocatedType()->isIntegerTy(8)) - return; + return false; Type *SmallestType = nullptr; @@ -291,16 +294,17 @@ static void upcastI8AllocasAndUses(Instruction &I, } if (!SmallestType) - return; // no valid casts found + return false; // no valid casts found // Replace alloca IRBuilder<> Builder(AI); auto *NewAlloca = Builder.CreateAlloca(SmallestType); ReplacedValues[AI] = NewAlloca; ToRemove.push_back(AI); + return true; } -static void +static bool downcastI64toI32InsertExtractElements(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { @@ -318,6 +322,7 @@ downcastI64toI32InsertExtractElements(Instruction &I, Extract->replaceAllUsesWith(NewExtract); ToRemove.push_back(Extract); + return true; } } @@ -335,8 +340,10 @@ downcastI64toI32InsertExtractElements(Instruction &I, Insert->replaceAllUsesWith(Insert32Index); ToRemove.push_back(Insert); + return true; } } + return false; } static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, @@ -453,17 +460,17 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val, // Expands the instruction `I` into corresponding loads and stores if it is a // memcpy call. In that case, the call instruction is added to the `ToRemove` // vector. `ReplacedValues` is unused. 
-static void legalizeMemCpy(Instruction &I, +static bool legalizeMemCpy(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { CallInst *CI = dyn_cast<CallInst>(&I); if (!CI) - return; + return false; Intrinsic::ID ID = CI->getIntrinsicID(); if (ID != Intrinsic::memcpy) - return; + return false; IRBuilder<> Builder(&I); Value *Dst = CI->getArgOperand(0); @@ -476,19 +483,20 @@ static void legalizeMemCpy(Instruction &I, assert(IsVolatile->getZExtValue() == 0 && "Expected IsVolatile to be false"); emitMemcpyExpansion(Builder, Dst, Src, Length); ToRemove.push_back(CI); + return true; } -static void legalizeMemSet(Instruction &I, +static bool legalizeMemSet(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { CallInst *CI = dyn_cast<CallInst>(&I); if (!CI) - return; + return false; Intrinsic::ID ID = CI->getIntrinsicID(); if (ID != Intrinsic::memset) - return; + return false; IRBuilder<> Builder(&I); Value *Dst = CI->getArgOperand(0); @@ -497,23 +505,25 @@ static void legalizeMemSet(Instruction &I, assert(Size && "Expected Size to be a ConstantInt"); emitMemsetExpansion(Builder, Dst, Val, Size, ReplacedValues); ToRemove.push_back(CI); + return true; } -static void updateFnegToFsub(Instruction &I, +static bool updateFnegToFsub(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { const Intrinsic::ID ID = I.getOpcode(); if (ID != Instruction::FNeg) - return; + return false; IRBuilder<> Builder(&I); Value *In = I.getOperand(0); Value *Zero = ConstantFP::get(In->getType(), -0.0); I.replaceAllUsesWith(Builder.CreateFSub(Zero, In)); ToRemove.push_back(&I); + return true; } -static void +static bool legalizeGetHighLowi64Bytes(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &ReplacedValues) { @@ -523,13 +533,13 @@ legalizeGetHighLowi64Bytes(Instruction &I, BitCast->getSrcTy()->isIntegerTy(64)) { ToRemove.push_back(BitCast); ReplacedValues[BitCast] = BitCast->getOperand(0); - return; + return true; } } if (auto *Extract = dyn_cast<ExtractElementInst>(&I)) { if (!dyn_cast<BitCastInst>(Extract->getVectorOperand())) - return; + return false; auto *VecTy = dyn_cast<FixedVectorType>(Extract->getVectorOperandType()); if (VecTy && VecTy->getElementType()->isIntegerTy(32) && VecTy->getNumElements() == 2) { @@ -557,12 +567,14 @@ legalizeGetHighLowi64Bytes(Instruction &I, } ToRemove.push_back(Extract); Extract->replaceAllUsesWith(ReplacedValues[Extract]); + return true; } } } + return false; } -static void +static bool legalizeScalarLoadStoreOnArrays(Instruction &I, SmallVectorImpl<Instruction *> &ToRemove, DenseMap<Value *, Value *> &) { @@ -579,14 +591,14 @@ legalizeScalarLoadStoreOnArrays(Instruction &I, PtrOpIndex = SI->getPointerOperandIndex(); LoadStoreTy = SI->getValueOperand()->getType(); } else - return; + return false; // If the load/store is not of a single-value type (i.e., scalar or vector) // then we do not modify it. It shouldn't be a vector either because the // dxil-data-scalarization pass is expected to run before this, but it's not // incorrect to apply this transformation to vector load/stores. 
if (!LoadStoreTy->isSingleValueType()) - return; + return false; Type *ArrayTy; if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp)) @@ -594,10 +606,10 @@ legalizeScalarLoadStoreOnArrays(Instruction &I, else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp)) ArrayTy = AllocaPtrOp->getAllocatedType(); else - return; + return false; if (!isa<ArrayType>(ArrayTy)) - return; + return false; assert(ArrayTy->getArrayElementType() == LoadStoreTy && "Expected array element type to be the same as to the scalar load or " @@ -607,6 +619,7 @@ legalizeScalarLoadStoreOnArrays(Instruction &I, Value *GEP = GetElementPtrInst::Create( ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator()); I.setOperand(PtrOpIndex, GEP); + return true; } namespace { @@ -624,13 +637,11 @@ public: ReplacedValues.clear(); for (auto &I : instructions(F)) { for (auto &LegalizationFn : LegalizationPipeline[Stage]) - LegalizationFn(I, ToRemove, ReplacedValues); + MadeChange |= LegalizationFn(I, ToRemove, ReplacedValues); } for (auto *Inst : reverse(ToRemove)) Inst->eraseFromParent(); - - MadeChange |= !ToRemove.empty(); } return MadeChange; } @@ -639,7 +650,7 @@ private: enum LegalizationStage { Stage1 = 0, Stage2 = 1, NumStages }; using LegalizationFnTy = - std::function<void(Instruction &, SmallVectorImpl<Instruction *> &, + std::function<bool(Instruction &, SmallVectorImpl<Instruction *> &, DenseMap<Value *, Value *> &)>; SmallVector<LegalizationFnTy> LegalizationPipeline[NumStages]; diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 566f3a9..c33ec0e 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -241,7 +241,6 @@ static void replaceAccess(IntrinsicInst *II, dxil::ResourceTypeInfo &RTI) { } static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) { - bool Changed = false; SmallVector<std::pair<IntrinsicInst *, dxil::ResourceTypeInfo>> Resources; for (BasicBlock &BB : F) for (Instruction &I : BB) @@ -254,7 +253,7 @@ static bool transformResourcePointers(Function &F, DXILResourceTypeMap &DRTM) { for (auto &[II, RI] : Resources) replaceAccess(II, RI); - return Changed; + return !Resources.empty(); } PreservedAnalyses DXILResourceAccess::run(Function &F, diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index dfc8162..ebdfcaa 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Frontend/HLSL/RootSignatureMetadata.h" #include "llvm/Frontend/HLSL/RootSignatureValidations.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" @@ -29,25 +30,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cstdint> -#include <optional> -#include <utility> using namespace llvm; using namespace llvm::dxil; -static bool reportError(LLVMContext *Ctx, Twine Message, - DiagnosticSeverity Severity = DS_Error) { - Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); - return true; -} - -static bool reportValueError(LLVMContext *Ctx, Twine ParamName, - uint32_t Value) { - Ctx->diagnose(DiagnosticInfoGeneric( - "Invalid value for " + ParamName + ": " + Twine(Value), DS_Error)); - return true; -} - static std::optional<uint32_t> extractMdIntValue(MDNode *Node, 
unsigned int OpId) { if (auto *CI = @@ -56,453 +42,10 @@ static std::optional<uint32_t> extractMdIntValue(MDNode *Node, return std::nullopt; } -static std::optional<float> extractMdFloatValue(MDNode *Node, - unsigned int OpId) { - if (auto *CI = mdconst::dyn_extract<ConstantFP>(Node->getOperand(OpId).get())) - return CI->getValueAPF().convertToFloat(); - return std::nullopt; -} - -static std::optional<StringRef> extractMdStringValue(MDNode *Node, - unsigned int OpId) { - MDString *NodeText = dyn_cast<MDString>(Node->getOperand(OpId)); - if (NodeText == nullptr) - return std::nullopt; - return NodeText->getString(); -} - -static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *RootFlagNode) { - - if (RootFlagNode->getNumOperands() != 2) - return reportError(Ctx, "Invalid format for RootFlag Element"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1)) - RSD.Flags = *Val; - else - return reportError(Ctx, "Invalid value for RootFlag"); - - return false; -} - -static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *RootConstantNode) { - - if (RootConstantNode->getNumOperands() != 5) - return reportError(Ctx, "Invalid format for RootConstants Element"); - - dxbc::RTS0::v1::RootParameterHeader Header; - // The parameter offset doesn't matter here - we recalculate it during - // serialization Header.ParameterOffset = 0; - Header.ParameterType = - llvm::to_underlying(dxbc::RootParameterType::Constants32Bit); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 1)) - Header.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - dxbc::RTS0::v1::RootConstants Constants; - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2)) - Constants.ShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for ShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3)) - Constants.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4)) - Constants.Num32BitValues = *Val; - else - return reportError(Ctx, "Invalid value for Num32BitValues"); - - RSD.ParametersContainer.addParameter(Header, Constants); - - return false; -} - -static bool parseRootDescriptors(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *RootDescriptorNode, - RootSignatureElementKind ElementKind) { - assert(ElementKind == RootSignatureElementKind::SRV || - ElementKind == RootSignatureElementKind::UAV || - ElementKind == RootSignatureElementKind::CBV && - "parseRootDescriptors should only be called with RootDescriptor " - "element kind."); - if (RootDescriptorNode->getNumOperands() != 5) - return reportError(Ctx, "Invalid format for Root Descriptor Element"); - - dxbc::RTS0::v1::RootParameterHeader Header; - switch (ElementKind) { - case RootSignatureElementKind::SRV: - Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::SRV); - break; - case RootSignatureElementKind::UAV: - Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::UAV); - break; - case RootSignatureElementKind::CBV: - Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::CBV); - break; - default: - llvm_unreachable("invalid Root Descriptor kind"); - break; - } - - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 1)) - Header.ShaderVisibility = *Val; - else - return 
reportError(Ctx, "Invalid value for ShaderVisibility"); - - dxbc::RTS0::v2::RootDescriptor Descriptor; - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2)) - Descriptor.ShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for ShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3)) - Descriptor.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (RSD.Version == 1) { - RSD.ParametersContainer.addParameter(Header, Descriptor); - return false; - } - assert(RSD.Version > 1); - - if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4)) - Descriptor.Flags = *Val; - else - return reportError(Ctx, "Invalid value for Root Descriptor Flags"); - - RSD.ParametersContainer.addParameter(Header, Descriptor); - return false; -} - -static bool parseDescriptorRange(LLVMContext *Ctx, - mcdxbc::DescriptorTable &Table, - MDNode *RangeDescriptorNode) { - - if (RangeDescriptorNode->getNumOperands() != 6) - return reportError(Ctx, "Invalid format for Descriptor Range"); - - dxbc::RTS0::v2::DescriptorRange Range; - - std::optional<StringRef> ElementText = - extractMdStringValue(RangeDescriptorNode, 0); - - if (!ElementText.has_value()) - return reportError(Ctx, "Descriptor Range, first element is not a string."); - - Range.RangeType = - StringSwitch<uint32_t>(*ElementText) - .Case("CBV", llvm::to_underlying(dxbc::DescriptorRangeType::CBV)) - .Case("SRV", llvm::to_underlying(dxbc::DescriptorRangeType::SRV)) - .Case("UAV", llvm::to_underlying(dxbc::DescriptorRangeType::UAV)) - .Case("Sampler", - llvm::to_underlying(dxbc::DescriptorRangeType::Sampler)) - .Default(~0U); - - if (Range.RangeType == ~0U) - return reportError(Ctx, "Invalid Descriptor Range type: " + *ElementText); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1)) - Range.NumDescriptors = *Val; - else - return reportError(Ctx, "Invalid value for Number of Descriptor in Range"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2)) - Range.BaseShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for BaseShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3)) - Range.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4)) - Range.OffsetInDescriptorsFromTableStart = *Val; - else - return reportError(Ctx, - "Invalid value for OffsetInDescriptorsFromTableStart"); - - if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5)) - Range.Flags = *Val; - else - return reportError(Ctx, "Invalid value for Descriptor Range Flags"); - - Table.Ranges.push_back(Range); - return false; -} - -static bool parseDescriptorTable(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *DescriptorTableNode) { - const unsigned int NumOperands = DescriptorTableNode->getNumOperands(); - if (NumOperands < 2) - return reportError(Ctx, "Invalid format for Descriptor Table"); - - dxbc::RTS0::v1::RootParameterHeader Header; - if (std::optional<uint32_t> Val = extractMdIntValue(DescriptorTableNode, 1)) - Header.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - mcdxbc::DescriptorTable Table; - Header.ParameterType = - llvm::to_underlying(dxbc::RootParameterType::DescriptorTable); - - for (unsigned int I = 2; I < NumOperands; I++) { - MDNode 
*Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I)); - if (Element == nullptr) - return reportError(Ctx, "Missing Root Element Metadata Node."); - - if (parseDescriptorRange(Ctx, Table, Element)) - return true; - } - - RSD.ParametersContainer.addParameter(Header, Table); - return false; -} - -static bool parseStaticSampler(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *StaticSamplerNode) { - if (StaticSamplerNode->getNumOperands() != 14) - return reportError(Ctx, "Invalid format for Static Sampler"); - - dxbc::RTS0::v1::StaticSampler Sampler; - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 1)) - Sampler.Filter = *Val; - else - return reportError(Ctx, "Invalid value for Filter"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 2)) - Sampler.AddressU = *Val; - else - return reportError(Ctx, "Invalid value for AddressU"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 3)) - Sampler.AddressV = *Val; - else - return reportError(Ctx, "Invalid value for AddressV"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 4)) - Sampler.AddressW = *Val; - else - return reportError(Ctx, "Invalid value for AddressW"); - - if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5)) - Sampler.MipLODBias = *Val; - else - return reportError(Ctx, "Invalid value for MipLODBias"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6)) - Sampler.MaxAnisotropy = *Val; - else - return reportError(Ctx, "Invalid value for MaxAnisotropy"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 7)) - Sampler.ComparisonFunc = *Val; - else - return reportError(Ctx, "Invalid value for ComparisonFunc "); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 8)) - Sampler.BorderColor = *Val; - else - return reportError(Ctx, "Invalid value for ComparisonFunc "); - - if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9)) - Sampler.MinLOD = *Val; - else - return reportError(Ctx, "Invalid value for MinLOD"); - - if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10)) - Sampler.MaxLOD = *Val; - else - return reportError(Ctx, "Invalid value for MaxLOD"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11)) - Sampler.ShaderRegister = *Val; - else - return reportError(Ctx, "Invalid value for ShaderRegister"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12)) - Sampler.RegisterSpace = *Val; - else - return reportError(Ctx, "Invalid value for RegisterSpace"); - - if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 13)) - Sampler.ShaderVisibility = *Val; - else - return reportError(Ctx, "Invalid value for ShaderVisibility"); - - RSD.StaticSamplers.push_back(Sampler); - return false; -} - -static bool parseRootSignatureElement(LLVMContext *Ctx, - mcdxbc::RootSignatureDesc &RSD, - MDNode *Element) { - std::optional<StringRef> ElementText = extractMdStringValue(Element, 0); - if (!ElementText.has_value()) - return reportError(Ctx, "Invalid format for Root Element"); - - RootSignatureElementKind ElementKind = - StringSwitch<RootSignatureElementKind>(*ElementText) - .Case("RootFlags", RootSignatureElementKind::RootFlags) - .Case("RootConstants", RootSignatureElementKind::RootConstants) - .Case("RootCBV", RootSignatureElementKind::CBV) - .Case("RootSRV", RootSignatureElementKind::SRV) - .Case("RootUAV", 
RootSignatureElementKind::UAV) - .Case("DescriptorTable", RootSignatureElementKind::DescriptorTable) - .Case("StaticSampler", RootSignatureElementKind::StaticSamplers) - .Default(RootSignatureElementKind::Error); - - switch (ElementKind) { - - case RootSignatureElementKind::RootFlags: - return parseRootFlags(Ctx, RSD, Element); - case RootSignatureElementKind::RootConstants: - return parseRootConstants(Ctx, RSD, Element); - case RootSignatureElementKind::CBV: - case RootSignatureElementKind::SRV: - case RootSignatureElementKind::UAV: - return parseRootDescriptors(Ctx, RSD, Element, ElementKind); - case RootSignatureElementKind::DescriptorTable: - return parseDescriptorTable(Ctx, RSD, Element); - case RootSignatureElementKind::StaticSamplers: - return parseStaticSampler(Ctx, RSD, Element); - case RootSignatureElementKind::Error: - return reportError(Ctx, "Invalid Root Signature Element: " + *ElementText); - } - - llvm_unreachable("Unhandled RootSignatureElementKind enum."); -} - -static bool parse(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, - MDNode *Node) { - bool HasError = false; - - // Loop through the Root Elements of the root signature. - for (const auto &Operand : Node->operands()) { - MDNode *Element = dyn_cast<MDNode>(Operand); - if (Element == nullptr) - return reportError(Ctx, "Missing Root Element Metadata Node."); - - HasError = HasError || parseRootSignatureElement(Ctx, RSD, Element); - } - - return HasError; -} - -static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { - - if (!llvm::hlsl::rootsig::verifyVersion(RSD.Version)) { - return reportValueError(Ctx, "Version", RSD.Version); - } - - if (!llvm::hlsl::rootsig::verifyRootFlag(RSD.Flags)) { - return reportValueError(Ctx, "RootFlags", RSD.Flags); - } - - for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) { - if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility)) - return reportValueError(Ctx, "ShaderVisibility", - Info.Header.ShaderVisibility); - - assert(dxbc::isValidParameterType(Info.Header.ParameterType) && - "Invalid value for ParameterType"); - - switch (Info.Header.ParameterType) { - - case llvm::to_underlying(dxbc::RootParameterType::CBV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): - case llvm::to_underlying(dxbc::RootParameterType::SRV): { - const dxbc::RTS0::v2::RootDescriptor &Descriptor = - RSD.ParametersContainer.getRootDescriptor(Info.Location); - if (!llvm::hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister)) - return reportValueError(Ctx, "ShaderRegister", - Descriptor.ShaderRegister); - - if (!llvm::hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace)) - return reportValueError(Ctx, "RegisterSpace", Descriptor.RegisterSpace); - - if (RSD.Version > 1) { - if (!llvm::hlsl::rootsig::verifyRootDescriptorFlag(RSD.Version, - Descriptor.Flags)) - return reportValueError(Ctx, "RootDescriptorFlag", Descriptor.Flags); - } - break; - } - case llvm::to_underlying(dxbc::RootParameterType::DescriptorTable): { - const mcdxbc::DescriptorTable &Table = - RSD.ParametersContainer.getDescriptorTable(Info.Location); - for (const dxbc::RTS0::v2::DescriptorRange &Range : Table) { - if (!llvm::hlsl::rootsig::verifyRangeType(Range.RangeType)) - return reportValueError(Ctx, "RangeType", Range.RangeType); - - if (!llvm::hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace)) - return reportValueError(Ctx, "RegisterSpace", Range.RegisterSpace); - - if (!llvm::hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors)) - return 
reportValueError(Ctx, "NumDescriptors", Range.NumDescriptors); - - if (!llvm::hlsl::rootsig::verifyDescriptorRangeFlag( - RSD.Version, Range.RangeType, Range.Flags)) - return reportValueError(Ctx, "DescriptorFlag", Range.Flags); - } - break; - } - } - } - - for (const dxbc::RTS0::v1::StaticSampler &Sampler : RSD.StaticSamplers) { - if (!llvm::hlsl::rootsig::verifySamplerFilter(Sampler.Filter)) - return reportValueError(Ctx, "Filter", Sampler.Filter); - - if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressU)) - return reportValueError(Ctx, "AddressU", Sampler.AddressU); - - if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressV)) - return reportValueError(Ctx, "AddressV", Sampler.AddressV); - - if (!llvm::hlsl::rootsig::verifyAddress(Sampler.AddressW)) - return reportValueError(Ctx, "AddressW", Sampler.AddressW); - - if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias)) - return reportValueError(Ctx, "MipLODBias", Sampler.MipLODBias); - - if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy)) - return reportValueError(Ctx, "MaxAnisotropy", Sampler.MaxAnisotropy); - - if (!llvm::hlsl::rootsig::verifyComparisonFunc(Sampler.ComparisonFunc)) - return reportValueError(Ctx, "ComparisonFunc", Sampler.ComparisonFunc); - - if (!llvm::hlsl::rootsig::verifyBorderColor(Sampler.BorderColor)) - return reportValueError(Ctx, "BorderColor", Sampler.BorderColor); - - if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MinLOD)) - return reportValueError(Ctx, "MinLOD", Sampler.MinLOD); - - if (!llvm::hlsl::rootsig::verifyLOD(Sampler.MaxLOD)) - return reportValueError(Ctx, "MaxLOD", Sampler.MaxLOD); - - if (!llvm::hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister)) - return reportValueError(Ctx, "ShaderRegister", Sampler.ShaderRegister); - - if (!llvm::hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace)) - return reportValueError(Ctx, "RegisterSpace", Sampler.RegisterSpace); - - if (!dxbc::isValidShaderVisibility(Sampler.ShaderVisibility)) - return reportValueError(Ctx, "ShaderVisibility", - Sampler.ShaderVisibility); - } - - return false; +static bool reportError(LLVMContext *Ctx, Twine Message, + DiagnosticSeverity Severity = DS_Error) { + Ctx->diagnose(DiagnosticInfoGeneric(Message, Severity)); + return true; } static SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> @@ -584,7 +127,9 @@ analyzeModule(Module &M) { // static sampler offset is calculated when writting dxcontainer. 
RSD.StaticSamplersOffset = 0u; - if (parse(Ctx, RSD, RootElementListNode) || validate(Ctx, RSD)) { + hlsl::rootsig::MetadataParser MDParser(RootElementListNode); + + if (MDParser.ParseRootSignature(Ctx, RSD)) { return RSDMap; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index fc39b38..254b7ff 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -26,17 +26,6 @@ namespace llvm { namespace dxil { -enum class RootSignatureElementKind { - Error = 0, - RootFlags = 1, - RootConstants = 2, - SRV = 3, - UAV = 4, - CBV = 5, - DescriptorTable = 6, - StaticSamplers = 7 -}; - class RootSignatureBindingInfo { private: SmallDenseMap<const Function *, mcdxbc::RootSignatureDesc> FuncToRsMap; diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index eb4adfe..e7e7f2c 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -106,11 +106,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, DXILResourceTypeMap &DRTM, const ModuleMetadataInfo &MMDI) { if (!CSF.Doubles) - CSF.Doubles = I.getType()->isDoubleTy(); + CSF.Doubles = I.getType()->getScalarType()->isDoubleTy(); if (!CSF.Doubles) { for (const Value *Op : I.operands()) { - if (Op->getType()->isDoubleTy()) { + if (Op->getType()->getScalarType()->isDoubleTy()) { CSF.Doubles = true; break; } @@ -130,12 +130,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } if (!CSF.LowPrecisionPresent) - CSF.LowPrecisionPresent = - I.getType()->isIntegerTy(16) || I.getType()->isHalfTy(); + CSF.LowPrecisionPresent = I.getType()->getScalarType()->isIntegerTy(16) || + I.getType()->getScalarType()->isHalfTy(); if (!CSF.LowPrecisionPresent) { for (const Value *Op : I.operands()) { - if (Op->getType()->isIntegerTy(16) || Op->getType()->isHalfTy()) { + if (Op->getType()->getScalarType()->isIntegerTy(16) || + Op->getType()->getScalarType()->isHalfTy()) { CSF.LowPrecisionPresent = true; break; } @@ -150,11 +151,11 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } if (!CSF.Int64Ops) - CSF.Int64Ops = I.getType()->isIntegerTy(64); + CSF.Int64Ops = I.getType()->getScalarType()->isIntegerTy(64); if (!CSF.Int64Ops && !isa<LifetimeIntrinsic>(&I)) { for (const Value *Op : I.operands()) { - if (Op->getType()->isIntegerTy(64)) { + if (Op->getType()->getScalarType()->isIntegerTy(64)) { CSF.Int64Ops = true; break; } diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index c2eb24b..c34eecd 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -38,7 +38,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/IR/Module.h" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e915a3c4..613cfb5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2385,19 +2385,9 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op, return Res; } -static bool isConstantOrUndef(const SDValue Op) { - if (Op->isUndef()) - return true; - if 
(isa<ConstantSDNode>(Op)) - return true; - if (isa<ConstantFPSDNode>(Op)) - return true; - return false; -} - -static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) { +static bool isConstantBUILD_VECTOR(const BuildVectorSDNode *Op) { for (unsigned i = 0; i < Op->getNumOperands(); ++i) - if (isConstantOrUndef(Op->getOperand(i))) + if (isIntOrFPConstant(Op->getOperand(i))) return true; return false; } @@ -2505,20 +2495,23 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, if (DAG.isSplatValue(Op, /*AllowUndefs=*/false)) return Op; - if (!isConstantOrUndefBUILD_VECTOR(Node)) { + if (!isConstantBUILD_VECTOR(Node)) { // Use INSERT_VECTOR_ELT operations rather than expand to stores. // The resulting code is the same length as the expansion, but it doesn't // use memory operations. - EVT ResTy = Node->getValueType(0); - assert(ResTy.isVector()); unsigned NumElts = ResTy.getVectorNumElements(); - SDValue Vector = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0)); + SDValue Op0 = Node->getOperand(0); + SDValue Vector = DAG.getUNDEF(ResTy); + + if (!Op0.isUndef()) + Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0); for (unsigned i = 1; i < NumElts; ++i) { - Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, - Node->getOperand(i), + SDValue Opi = Node->getOperand(i); + if (Opi.isUndef()) + continue; + Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi, DAG.getConstant(i, DL, Subtarget.getGRLenVT())); } return Vector; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index feb4eb3..d9680c7 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -969,7 +969,7 @@ void MipsTargetELFStreamer::finish() { Align Alignment = Section.getAlign(); S.switchSection(&Section); - if (Section.useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(Section)) S.emitCodeAlignment(Alignment, &STI, Alignment.value()); else S.emitValueToAlignment(Alignment, 0, 1, Alignment.value()); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 614b321..ce9cd12 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -15,8 +15,6 @@ using namespace llvm; -void NVPTXMCAsmInfo::anchor() {} - NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options) { if (TheTriple.getArch() == Triple::nvptx64) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index 77c4dae..f071406 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -19,8 +19,6 @@ namespace llvm { class Triple; class NVPTXMCAsmInfo : public MCAsmInfo { - virtual void anchor(); - public: explicit NVPTXMCAsmInfo(const Triple &TheTriple, const MCTargetOptions &Options); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp index 9f91143..329e3b5 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp @@ -97,10 +97,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection, if (isDwarfSection(FI, Section)) { // Emit DWARF .file directives in the 
outermost scope. outputDwarfFileDirectives(); - OS << "\t.section"; - Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(), - getStreamer().getContext().getTargetTriple(), - OS, SubSection); + OS << "\t.section\t" << Section->getName() << '\n'; // DWARF sections are enclosed into braces - emit the open one. OS << "\t{\n"; HasSections = true; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f2c2f46..ddcecc00 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // promoted to f32. v2f16 is expanded to f16, which is then promoted // to f32. for (const auto &Op : - {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { + {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); - setOperationAction(Op, MVT::f64, Legal); + // only div/rem/sqrt are legal for f64 + if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) { + setOperationAction(Op, MVT::f64, Legal); + } setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b5df4c6..442b900 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1234,7 +1234,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>; defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>; defm FMA_F64 : FMA<F64RT, allow_ftz = false>; -// sin/cos +// sin/cos/tanh class UnaryOpAllowsApproxFn<SDPatternOperator operator> : PatFrag<(ops node:$A), @@ -1250,6 +1250,10 @@ def COS_APPROX_f32 : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz), "cos.approx$ftz.f32", [(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>; +def TANH_APPROX_f32 : + BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32", + [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>, + Requires<[hasPTX<70>, hasSM<75>]>; //----------------------------------- // Bitwise operations diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 8baf866..1af2f9c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -220,8 +220,6 @@ bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, return evaluateAsRelocatable(Expr, Res, Asm); } -void PPCXCOFFMCAsmInfo::anchor() {} - PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle) report_fatal_error("XCOFF is not supported for little-endian targets"); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 0f945b3..6af1bd7 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -33,8 +33,6 @@ public: }; class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF { - void anchor() override; - public: explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &); void printSpecifierExpr(raw_ostream &OS, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp 
b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 54497d9..3dad0e8 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -213,7 +213,7 @@ public: void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override { if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) { MCSymbolXCOFF *TCSym = - cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly()) + static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly()) ->getQualNameSymbol(); // On AIX, we have TLS variable offsets (symbol@({gd|ie|le|ld}) depending // on the TLS access method (or model). For the general-dynamic access diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index a091b21..ce1d51a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2274,9 +2274,9 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV, void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { // Setup CurrentFnDescSym and its containing csect. - MCSectionXCOFF *FnDescSec = - cast<MCSectionXCOFF>(getObjFileLowering().getSectionForFunctionDescriptor( - &MF.getFunction(), TM)); + auto *FnDescSec = static_cast<MCSectionXCOFF *>( + getObjFileLowering().getSectionForFunctionDescriptor(&MF.getFunction(), + TM)); FnDescSec->setAlignment(Align(Subtarget->isPPC64() ? 8 : 4)); CurrentFnDescSym = FnDescSec->getQualNameSymbol(); @@ -2669,9 +2669,9 @@ void PPCAIXAsmPrinter::emitTracebackTable() { MCSymbol *EHInfoSym = TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF); MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym, TOCType_EHBlock); - const MCSymbol *TOCBaseSym = - cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) - ->getQualNameSymbol(); + const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>( + getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); const MCExpr *Exp = MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx), MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx); @@ -2788,7 +2788,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { } } - MCSectionXCOFF *Csect = cast<MCSectionXCOFF>( + auto *Csect = static_cast<MCSectionXCOFF *>( getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); // Switch to the containing csect. @@ -2869,9 +2869,9 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), PointerSize); // Emit TOC base address. - const MCSymbol *TOCBaseSym = - cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) - ->getQualNameSymbol(); + const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>( + getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); OutStreamer->emitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext), PointerSize); // Emit a null environment pointer. 
@@ -2996,10 +2996,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { Name += Prefix; Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName(); MCSymbol *S = OutContext.getOrCreateSymbol(Name); - TCEntry = cast<MCSectionXCOFF>( + TCEntry = static_cast<MCSectionXCOFF *>( getObjFileLowering().getSectionForTOCEntry(S, TM)); } else { - TCEntry = cast<MCSectionXCOFF>( + TCEntry = static_cast<MCSectionXCOFF *>( getObjFileLowering().getSectionForTOCEntry(I.first.first, TM)); } OutStreamer->switchSection(TCEntry); @@ -3054,7 +3054,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { return; SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM); - MCSectionXCOFF *Csect = cast<MCSectionXCOFF>( + auto *Csect = static_cast<MCSectionXCOFF *>( getObjFileLowering().SectionForGlobal(GO, GOKind, TM)); Align GOAlign = getGVAlignment(GO, GO->getDataLayout()); @@ -3316,9 +3316,9 @@ void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV, GlobalType = TOCType_GlobalExternal; MCSymbol *TypeInfoSym = TM.getSymbol(GV); MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym, GlobalType); - const MCSymbol *TOCBaseSym = - cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) - ->getQualNameSymbol(); + const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>( + getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); auto &Ctx = OutStreamer->getContext(); const MCExpr *Exp = MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx), diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index a143d85..d71c42c 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -3849,9 +3849,14 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, switch (Inst.getOpcode()) { default: break; - case RISCV::PseudoC_ADDI_NOP: - emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP)); + case RISCV::PseudoC_ADDI_NOP: { + if (Inst.getOperand(2).getImm() == 0) + emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP)); + else + emitToStreamer( + Out, MCInstBuilder(RISCV::C_NOP_HINT).addOperand(Inst.getOperand(2))); return false; + } case RISCV::PseudoLLAImm: case RISCV::PseudoLAImm: case RISCV::PseudoLI: { diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index fa7bcfa..5e54b82 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -193,21 +193,19 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo, static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { - if (RegNo == 0) { + if (RegNo == 0) return MCDisassembler::Fail; - } return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus -DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint32_t Address, - const MCDisassembler *Decoder) { - if (RegNo == 2) { +static DecodeStatus DecodeGPRNoX2RegisterClass(MCInst &Inst, uint64_t RegNo, + uint32_t Address, + const MCDisassembler *Decoder) { + if (RegNo == 2) return MCDisassembler::Fail; - } - return DecodeGPRNoX0RegisterClass(Inst, RegNo, Address, Decoder); + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } static DecodeStatus DecodeGPRNoX31RegisterClass(MCInst &Inst, uint32_t RegNo, @@ -536,31 +534,6 @@ static DecodeStatus 
decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus -decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - -static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder); @@ -579,18 +552,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, #include "RISCVGenDisassemblerTables.inc" -static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - if (!Check(S, DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - Inst.addOperand(Inst.getOperand(0)); - Inst.addOperand(MCOperand::createImm(0)); - return S; -} - static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -601,66 +562,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, return MCDisassembler::Success; } -static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createReg(RISCV::X0)); - uint32_t Imm = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); - [[maybe_unused]] DecodeStatus Result = - decodeSImmOperand<6>(Inst, Imm, Address, Decoder); - assert(Result == MCDisassembler::Success && "Invalid immediate"); - return MCDisassembler::Success; -} - -static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createReg(RISCV::X0)); - uint32_t Imm = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); - return decodeCLUIImmOperand(Inst, Imm, Address, Decoder); -} - -static DecodeStatus -decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - Inst.addOperand(MCOperand::createReg(RISCV::X0)); - Inst.addOperand(Inst.getOperand(0)); - - uint32_t UImm6 = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); - return decodeUImmLog2XLenNonZeroOperand(Inst, UImm6, Address, Decoder); -} - -static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder))) - return MCDisassembler::Fail; - return S; -} - -static DecodeStatus 
decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - Inst.addOperand(Inst.getOperand(0)); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder))) - return MCDisassembler::Fail; - return S; -} - static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 7ad5d5f..bddea43 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -330,7 +330,6 @@ enum OperandType : unsigned { OPERAND_UIMM32, OPERAND_UIMM48, OPERAND_UIMM64, - OPERAND_ZERO, OPERAND_THREE, OPERAND_FOUR, OPERAND_SIMM5, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index baa508a..269b117 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -13,13 +13,7 @@ #include "MCTargetDesc/RISCVAsmBackend.h" #include "MCTargetDesc/RISCVMCAsmInfo.h" -#include "RISCVFixupKinds.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index d4f5d8f..2f32e2a 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -293,7 +293,7 @@ void RISCVAsmPrinter::emitNTLHint(const MachineInstr *MI) { MCInst Hint; if (STI->hasStdExtZca()) - Hint.setOpcode(RISCV::C_ADD_HINT); + Hint.setOpcode(RISCV::C_ADD); else Hint.setOpcode(RISCV::ADD); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index f9c0b54..171940e 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1272,7 +1272,7 @@ def FeatureVendorXSfmm128t def FeatureVendorXSfvqmaccdod : RISCVExtension<1, 0, "SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)", - [FeatureStdExtZve32x]>; + [FeatureStdExtZve32x, FeatureStdExtZvl128b]>; def HasVendorXSfvqmaccdod : Predicate<"Subtarget->hasVendorXSfvqmaccdod()">, AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod), @@ -1281,7 +1281,7 @@ def HasVendorXSfvqmaccdod def FeatureVendorXSfvqmaccqoq : RISCVExtension<1, 0, "SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)", - [FeatureStdExtZve32x]>; + [FeatureStdExtZve32x, FeatureStdExtZvl256b]>; def HasVendorXSfvqmaccqoq : Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">, AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq), @@ -1290,7 +1290,7 @@ def HasVendorXSfvqmaccqoq def FeatureVendorXSfvfwmaccqqq : RISCVExtension<1, 0, "SiFive Matrix Multiply Accumulate Instruction (4-by-4)", - [FeatureStdExtZvfbfmin]>; + [FeatureStdExtZvfbfmin, FeatureStdExtZvl128b]>; def HasVendorXSfvfwmaccqqq : Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">, AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq), diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp 
b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3918dd2..54845e5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1618,6 +1618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } } + // Customize load and store operation for bf16 if zfh isn't enabled. + if (Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh()) { + setOperationAction(ISD::LOAD, MVT::bf16, Custom); + setOperationAction(ISD::STORE, MVT::bf16, Custom); + } + // Function alignments. const Align FunctionAlignment(Subtarget.hasStdExtZca() ? 2 : 4); setMinFunctionAlignment(FunctionAlignment); @@ -7216,6 +7222,47 @@ static SDValue SplitStrictFPVectorOp(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues({V, HiRes.getValue(1)}, DL); } +SDValue +RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Load(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() && + "Unexpected bfloat16 load lowering"); + + SDLoc DL(Op); + LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); + EVT MemVT = LD->getMemoryVT(); + SDValue Load = DAG.getExtLoad( + ISD::ZEXTLOAD, DL, Subtarget.getXLenVT(), LD->getChain(), + LD->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + LD->getMemOperand()); + // Use a mask to make bf16 nan-boxing valid when we don't have the flh + // instruction. -65536 would be treated as a small number and thus lui can be + // used directly to get the constant. + SDValue mask = DAG.getSignedConstant(-65536, DL, Subtarget.getXLenVT()); + SDValue OrSixteenOne = + DAG.getNode(ISD::OR, DL, Load.getValueType(), {Load, mask}); + SDValue ConvertedResult = + DAG.getNode(RISCVISD::NDS_FMV_BF16_X, DL, MVT::bf16, OrSixteenOne); + return DAG.getMergeValues({ConvertedResult, Load.getValue(1)}, DL); +} + +SDValue +RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Store(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasVendorXAndesBFHCvt() && !Subtarget.hasStdExtZfh() && + "Unexpected bfloat16 store lowering"); + + StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); + SDLoc DL(Op); + SDValue FMV = DAG.getNode(RISCVISD::NDS_FMV_X_ANYEXTBF16, DL, + Subtarget.getXLenVT(), ST->getValue()); + return DAG.getTruncStore( + ST->getChain(), DL, FMV, ST->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), ST->getMemoryVT().getSizeInBits()), + ST->getMemOperand()); +} + SDValue RISCVTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -7914,6 +7961,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getMergeValues({Pair, Chain}, DL); } + if (VT == MVT::bf16) + return lowerXAndesBfHCvtBFloat16Load(Op, DAG); + // Handle normal vector tuple load. if (VT.isRISCVVectorTuple()) { SDLoc DL(Op); @@ -7998,6 +8048,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, {Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64, Store->getMemOperand()); } + + if (VT == MVT::bf16) + return lowerXAndesBfHCvtBFloat16Store(Op, DAG); + // Handle normal vector tuple store. if (VT.isRISCVVectorTuple()) { SDLoc DL(Op); @@ -16079,7 +16133,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt = CNode->getZExtValue(); // Don't do this if the Xqciac extension is enabled and the MulAmt in simm12. 
- if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt)) + if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); const bool HasShlAdd = Subtarget.hasStdExtZba() || @@ -16184,10 +16238,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x)) for (uint64_t Offset : {3, 5, 9}) { if (isPowerOf2_64(MulAmt + Offset)) { + unsigned ShAmt = Log2_64(MulAmt + Offset); + if (ShAmt >= VT.getSizeInBits()) + continue; SDLoc DL(N); SDValue Shift1 = - DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT)); + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f0447e0..ca70c46 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -578,6 +578,9 @@ private: SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXAndesBfHCvtBFloat16Load(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXAndesBfHCvtBFloat16Store(SDValue Op, SelectionDAG &DAG) const; + bool isEligibleForTailCallOptimization( CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, const SmallVector<CCValAssign, 16> &ArgLocs) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 64f9e3e..085064e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2859,9 +2859,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM16_NONZERO: Ok = isUInt<16>(Imm) && (Imm != 0); break; - case RISCVOp::OPERAND_ZERO: - Ok = Imm == 0; - break; case RISCVOp::OPERAND_THREE: Ok = Imm == 3; break; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 8252a9b..c5551fb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -57,12 +57,6 @@ def simm6nonzero : RISCVOp, }]; } -def immzero : RISCVOp, - ImmLeaf<XLenVT, [{return (Imm == 0);}]> { - let ParserMatchClass = ImmZeroAsmOperand; - let OperandType = "OPERAND_ZERO"; -} - def CLUIImmAsmOperand : AsmOperandClass { let Name = "CLUIImm"; let RenderMethod = "addImmOperands"; @@ -272,7 +266,7 @@ class Bcz<bits<3> funct3, string OpcodeStr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class Shift_right<bits<2> funct2, string OpcodeStr> : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), - (ins GPRC:$rs1, uimmlog2xlennonzero:$imm), + (ins GPRC:$rs1, uimmlog2xlen:$imm), OpcodeStr, "$rs1, $imm"> { let Constraints = "$rs1 = $rd"; let Inst{12} = imm{5}; @@ -402,17 +396,19 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">, let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, simm6nonzero:$imm), + (ins GPRNoX0:$rd, simm6:$imm), "c.addi", "$rd, $imm">, Sched<[WriteIALU, ReadIALU]> { let Constraints = "$rd = $rd_wb"; } -// Alternate syntax for c.nop. Converted to C_NOP by the assembler. +// Alternate syntax for c.nop. Converted to C_NOP/C_NOP_HINT by the assembler. 
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, isAsmParserOnly = 1 in -def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm), - [], "c.addi", "$rd, $imm">; +def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, simm6:$imm), + [], "c.addi", "$rd, $imm"> { + let Constraints = "$rs1 = $rd"; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, DecoderNamespace = "RV32Only", Defs = [X1], @@ -430,7 +426,7 @@ def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb), } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm), +def C_LI : RVInst16CI<0b010, 0b01, (outs GPR:$rd), (ins simm6:$imm), "c.li", "$rd, $imm">, Sched<[WriteIALU]>; @@ -449,7 +445,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb), } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd), +def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX2:$rd), (ins c_lui_imm:$imm), "c.lui", "$rd, $imm">, Sched<[WriteIALU]>; @@ -497,8 +493,8 @@ def C_BEQZ : Bcz<0b110, "c.beqz">, Sched<[WriteJmp, ReadJmp]>; def C_BNEZ : Bcz<0b111, "c.bnez">, Sched<[WriteJmp, ReadJmp]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm), +def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), + (ins GPR:$rd, uimmlog2xlen:$imm), "c.slli", "$rd, $imm">, Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; @@ -544,7 +540,7 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1), let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1, isAsCheapAsAMove = 1 in -def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2), +def C_MV : RVInst16CR<0b1000, 0b10, (outs GPR:$rs1), (ins GPRNoX0:$rs2), "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]>; @@ -557,8 +553,8 @@ def C_JALR : RVInst16CR<0b1001, 0b10, (outs), (ins GPRNoX0:$rs1), "c.jalr", "$rs1">, Sched<[WriteJalr, ReadJalr]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rd), - (ins GPRNoX0:$rs1, GPRNoX0:$rs2), +def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd), + (ins GPR:$rs1, GPRNoX0:$rs2), "c.add", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU, ReadIALU]> { let Constraints = "$rs1 = $rd"; @@ -616,81 +612,6 @@ def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm), let rd = 0; } -def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, immzero:$imm), - "c.addi", "$rd, $imm">, - Sched<[WriteIALU, ReadIALU]> { - let Constraints = "$rd = $rd_wb"; - let imm = 0; - let DecoderMethod = "decodeRVCInstrRdRs1ImmZero"; -} - -def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm), - "c.li", "$rd, $imm">, - Sched<[WriteIALU]> { - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdSImm6"; -} - -def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), - (ins c_lui_imm:$imm), - "c.lui", "$rd, $imm">, - Sched<[WriteIALU]> { - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdCLUIImm"; -} - -def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2), - "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]> { - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs2"; -} - -def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rd), - (ins GPRX0:$rs1, GPRNoX0:$rs2), - "c.add", "$rs1, 
$rs2">, - Sched<[WriteIALU, ReadIALU, ReadIALU]> { - let Constraints = "$rs1 = $rd"; - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs1Rs2"; -} - -def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), - (ins GPRX0:$rd, uimmlog2xlennonzero:$imm), - "c.slli", "$rd, $imm">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rd = $rd_wb"; - let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs1UImmLog2XLenNonZero"; -} - -def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), - "c.slli64", "$rd">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rd = $rd_wb"; - let imm = 0; -} - -def C_SRLI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), - (ins GPRC:$rs1), - "c.srli64", "$rs1">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rs1 = $rd"; - let Inst{6-2} = 0; - let Inst{11-10} = 0b00; - let Inst{12} = 0; -} - -def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), - (ins GPRC:$rs1), - "c.srai64", "$rs1">, - Sched<[WriteShiftImm, ReadShiftImm]> { - let Constraints = "$rs1 = $rd"; - let Inst{6-2} = 0; - let Inst{11-10} = 0b01; - let Inst{12} = 0; -} - } // Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0, // mayStore = 0 @@ -699,15 +620,17 @@ def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd), //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZca] in { -// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6. -def : InstAlias<"c.addi x0, $imm", (C_NOP_HINT simm6nonzero:$imm), 0>; +// Legacy aliases. +def : InstAlias<"c.slli64 $rd", (C_SLLI GPR:$rd, 0), 0>; +def : InstAlias<"c.srli64 $rs1", (C_SRLI GPRC:$rs1, 0), 0>; +def : InstAlias<"c.srai64 $rs1", (C_SRAI GPRC:$rs1, 0), 0>; } let Predicates = [HasStdExtC, HasStdExtZihintntl] in { -def : InstAlias<"c.ntl.p1", (C_ADD_HINT X0, X2)>; -def : InstAlias<"c.ntl.pall", (C_ADD_HINT X0, X3)>; -def : InstAlias<"c.ntl.s1", (C_ADD_HINT X0, X4)>; -def : InstAlias<"c.ntl.all", (C_ADD_HINT X0, X5)>; +def : InstAlias<"c.ntl.p1", (C_ADD X0, X2)>; +def : InstAlias<"c.ntl.pall", (C_ADD X0, X3)>; +def : InstAlias<"c.ntl.s1", (C_ADD X0, X4)>; +def : InstAlias<"c.ntl.all", (C_ADD X0, X5)>; } // Predicates = [HasStdExtC, HasStdExtZihintntl] let EmitPriority = 0 in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 7f1077a..dd365cf 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -89,39 +89,41 @@ class PLI_B_i<bits<8> funct8, string opcodestr> } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnary<bits<3> f, string opcodestr, dag operands, string argstr> - : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), operands, opcodestr, argstr> { +class RVPShift_ri<bits<3> f, bits<3> funct3, string opcodestr, Operand ImmType> + : RVInstIBase<funct3, OPC_OP_IMM_32, (outs GPR:$rd), + (ins GPR:$rs1, ImmType:$shamt), opcodestr, + "$rd, $rs1, $shamt"> { let Inst{31} = 0b1; let Inst{30-28} = f; let Inst{27} = 0b0; } -class RVPUnaryImm5<bits<3> f, string opcodestr> - : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm5:$uimm5), "$rd, $rs1, $uimm5"> { - bits<5> uimm5; +class RVPShiftW_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm5> { + bits<5> shamt; let Inst{26-25} = 0b01; - let Inst{24-20} = uimm5; + let Inst{24-20} = shamt; } -class RVPUnaryImm4<bits<3> f, string opcodestr> - : RVPUnary<f, opcodestr, (ins GPR:$rs1, 
uimm4:$uimm4), "$rd, $rs1, $uimm4"> { - bits<4> uimm4; +class RVPShiftH_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm4> { + bits<4> shamt; let Inst{26-24} = 0b001; - let Inst{23-20} = uimm4; + let Inst{23-20} = shamt; } -class RVPUnaryImm3<bits<3> f, string opcodestr> - : RVPUnary<f, opcodestr, (ins GPR:$rs1, uimm3:$uimm3), "$rd, $rs1, $uimm3"> { - bits<3> uimm3; +class RVPShiftB_ri<bits<3> f, bits<3> funct3, string opcodestr> + : RVPShift_ri<f, funct3, opcodestr, uimm3> { + bits<3> shamt; let Inst{26-23} = 0b0001; - let Inst{22-20} = uimm3; + let Inst{22-20} = shamt; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVPUnaryWUF<bits<2> w, bits<5> uf, string opcodestr> +class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr> : RVInstIBase<0b010, OPC_OP_IMM_32, (outs GPR:$rd), (ins GPR:$rs1), opcodestr, "$rd, $rs1"> { let Inst{31-27} = 0b11100; @@ -149,16 +151,16 @@ def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in { -def PSLLI_B : RVPUnaryImm3<0b000, "pslli.b">; -def PSLLI_H : RVPUnaryImm4<0b000, "pslli.h">; -def PSSLAI_H : RVPUnaryImm4<0b101, "psslai.h">; +def PSLLI_B : RVPShiftB_ri<0b000, 0b010, "pslli.b">; +def PSLLI_H : RVPShiftH_ri<0b000, 0b010, "pslli.h">; +def PSSLAI_H : RVPShiftH_ri<0b101, 0b010, "psslai.h">; } // Predicates = [HasStdExtP] let DecoderNamespace = "RV32Only", Predicates = [HasStdExtP, IsRV32] in -def SSLAI : RVPUnaryImm5<0b101, "sslai">; +def SSLAI : RVPShiftW_ri<0b101, 0b010, "sslai">; let Predicates = [HasStdExtP, IsRV64] in { -def PSLLI_W : RVPUnaryImm5<0b000, "pslli.w">; -def PSSLAI_W : RVPUnaryImm5<0b101, "psslai.w">; +def PSLLI_W : RVPShiftW_ri<0b000, 0b010, "pslli.w">; +def PSSLAI_W : RVPShiftW_ri<0b101, 0b010, "psslai.w">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in @@ -169,13 +171,13 @@ let Predicates = [HasStdExtP] in def PLI_B : PLI_B_i<0b10110100, "pli.b">; let Predicates = [HasStdExtP] in { -def PSEXT_H_B : RVPUnaryWUF<0b00, 0b00100, "psext.h.b">; -def PSABS_H : RVPUnaryWUF<0b00, 0b00111, "psabs.h">; -def PSABS_B : RVPUnaryWUF<0b10, 0b00111, "psabs.b">; +def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; +def PSABS_H : RVPUnary_ri<0b00, 0b00111, "psabs.h">; +def PSABS_B : RVPUnary_ri<0b10, 0b00111, "psabs.b">; } // Predicates = [HasStdExtP] let Predicates = [HasStdExtP, IsRV64] in { -def PSEXT_W_B : RVPUnaryWUF<0b01, 0b00100, "psext.w.b">; -def PSEXT_W_H : RVPUnaryWUF<0b01, 0b00101, "psext.w.h">; +def PSEXT_W_B : RVPUnary_ri<0b01, 0b00100, "psext.w.b">; +def PSEXT_W_H : RVPUnary_ri<0b01, 0b00101, "psext.w.h">; } // Predicates = [HasStdExtP, IsRV64] let Predicates = [HasStdExtP] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index dfa532a..6afc942d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -788,7 +788,7 @@ class VPseudoUSLoadNoMask<VReg RetClass, DAGOperand sewop = sew> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, - sewop:$sew, vec_policy:$policy), []>, + sewop:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -804,7 +804,7 @@ class VPseudoUSLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew, - 
vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -821,7 +821,7 @@ class VPseudoUSLoadFFNoMask<VReg RetClass, int EEW> : RISCVVPseudo<(outs RetClass:$rd, GPR:$vl), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -837,7 +837,7 @@ class VPseudoUSLoadFFMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -854,7 +854,7 @@ class VPseudoSLoadNoMask<VReg RetClass, int EEW> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, GPR:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -870,7 +870,7 @@ class VPseudoSLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, GPR:$rs2, VMaskOp:$vm, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -892,7 +892,7 @@ class VPseudoILoadNoMask<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, IdxClass:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLX</*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -914,7 +914,7 @@ class VPseudoILoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$rs2, VMaskOp:$vm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLX</*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -933,7 +933,7 @@ class VPseudoUSStoreNoMask<VReg StClass, DAGOperand sewop = sew> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, AVL:$vl, - sewop:$sew), []>, + sewop:$sew)>, RISCVVSE</*Masked*/0, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -946,7 +946,7 @@ class VPseudoUSStoreMask<VReg StClass, int EEW> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSE</*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -960,7 +960,7 @@ class VPseudoSStoreNoMask<VReg StClass, int EEW> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, - AVL:$vl, sew:$sew), []>, + AVL:$vl, sew:$sew)>, RISCVVSE</*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -973,7 +973,7 @@ class VPseudoSStoreMask<VReg StClass, int EEW> : RISCVVPseudo<(outs), (ins StClass:$rd, GPRMemZeroOffset:$rs1, GPR:$rs2, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSE</*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -986,7 
+986,7 @@ class VPseudoSStoreMask<VReg StClass, class VPseudoNullaryNoMask<VReg RegClass> : RISCVVPseudo<(outs RegClass:$rd), (ins RegClass:$passthru, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1015,7 +1015,7 @@ class VPseudoNullaryMask<VReg RegClass> : // Nullary for pseudo instructions. They are expanded in // RISCVExpandPseudoInsts pass. class VPseudoNullaryPseudoM<string BaseInst> : - RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew), []> { + RISCVVPseudo<(outs VR:$rd), (ins AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1031,7 +1031,7 @@ class VPseudoUnaryNoMask<DAGOperand RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, OpClass:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1047,7 +1047,7 @@ class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass, string Constraint = "", bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), - (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew), []> { + (ins OpClass:$rs2, AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1063,7 +1063,7 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1084,7 +1084,7 @@ class VPseudoUnaryMask<VReg RetClass, DAGOperand sewop = sew> : RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, - VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy), []> { + VMaskOp:$vm, AVL:$vl, sewop:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1104,7 +1104,7 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2, VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1139,7 +1139,7 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass, class VPseudoUnaryNoMaskGPROut : RISCVVPseudo<(outs GPR:$rd), - (ins VR:$rs2, AVL:$vl, sew_mask:$sew), []> { + (ins VR:$rs2, AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1149,7 +1149,7 @@ class VPseudoUnaryNoMaskGPROut : class VPseudoUnaryMaskGPROut : RISCVVPseudo<(outs GPR:$rd), - (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew), []> { + (ins VR:$rs1, VMaskOp:$vm, AVL:$vl, sew_mask:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1163,7 +1163,7 @@ class VPseudoUnaryAnyMask<VReg RetClass, VReg Op1Class> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, - VR:$vm, AVL:$vl, sew:$sew), []> { + VR:$vm, AVL:$vl, sew:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1197,7 +1197,7 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> 
{ let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1241,7 +1241,7 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass, (ins GetVRegNoV0<RetClass>.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, VMaskOp:$vm, vec_rm:$rm, AVL:$vl, - sew:$sew, vec_policy:$policy), []> { + sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1266,7 +1266,7 @@ class VPseudoTiedBinaryNoMask<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew, - vec_policy:$policy), []> { + vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1288,7 +1288,7 @@ class VPseudoTiedBinaryNoMaskRoundingMode<VReg RetClass, (ins RetClass:$rs2, Op2Class:$rs1, vec_rm:$rm, AVL:$vl, sew:$sew, - vec_policy:$policy), []> { + vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1380,7 +1380,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass, Op1Class:$rs2, Op2Class:$rs1, VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1451,7 +1451,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass, Op2Class:$rs1, VMaskOp:$vm, vec_rm:$rm, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1480,7 +1480,7 @@ class VPseudoBinaryCarry<VReg RetClass, (ins Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, AVL:$vl, sew:$sew), (ins Op1Class:$rs2, Op2Class:$rs1, - AVL:$vl, sew:$sew)), []> { + AVL:$vl, sew:$sew))> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1498,7 +1498,7 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMV0:$carry, AVL:$vl, sew:$sew), []> { + VMV0:$carry, AVL:$vl, sew:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1516,7 +1516,7 @@ class VPseudoTernaryNoMask<VReg RetClass, string Constraint> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew), []> { + AVL:$vl, sew:$sew)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1532,7 +1532,7 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, bits<2> TargetConstraintType = 1> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - AVL:$vl, sew:$sew, vec_policy:$policy), []> { + AVL:$vl, sew:$sew, vec_policy:$policy)> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -1570,7 +1570,7 @@ class VPseudoUSSegLoadNoMask<VReg RetClass, bits<4> NF> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$vl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1587,7 +1587,7 @@ class VPseudoUSSegLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1605,7 +1605,7 @@ class VPseudoUSSegLoadFFNoMask<VReg RetClass, bits<4> NF> : RISCVVPseudo<(outs 
RetClass:$rd, GPR:$vl), (ins RetClass:$dest, GPRMemZeroOffset:$rs1, AVL:$avl, - sew:$sew, vec_policy:$policy), []>, + sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1622,7 +1622,7 @@ class VPseudoUSSegLoadFFMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, VMaskOp:$vm, AVL:$avl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1640,7 +1640,7 @@ class VPseudoSSegLoadNoMask<VReg RetClass, bits<4> NF> : RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1657,7 +1657,7 @@ class VPseudoSSegLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, GPR:$offset, VMaskOp:$vm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLSEG<NF, /*Masked*/1, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> { let mayLoad = 1; let mayStore = 0; @@ -1679,7 +1679,7 @@ class VPseudoISegLoadNoMask<VReg RetClass, RISCVVPseudo<(outs RetClass:$rd), (ins RetClass:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, AVL:$vl, sew:$sew, - vec_policy:$policy), []>, + vec_policy:$policy)>, RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -1701,7 +1701,7 @@ class VPseudoISegLoadMask<VReg RetClass, RISCVVPseudo<(outs GetVRegNoV0<RetClass>.R:$rd), (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMemZeroOffset:$rs1, IdxClass:$offset, VMaskOp:$vm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, + AVL:$vl, sew:$sew, vec_policy:$policy)>, RISCVVLXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 1; let mayStore = 0; @@ -1735,7 +1735,7 @@ class VPseudoUSSegStoreMask<VReg ValClass, bits<4> NF> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSSEG<NF, /*Masked*/1, /*Strided*/0, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1750,7 +1750,7 @@ class VPseudoSSegStoreNoMask<VReg ValClass, bits<4> NF> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR:$offset, - AVL:$vl, sew:$sew), []>, + AVL:$vl, sew:$sew)>, RISCVVSSEG<NF, /*Masked*/0, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1764,7 +1764,7 @@ class VPseudoSSegStoreMask<VReg ValClass, bits<4> NF> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, GPR: $offset, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSSEG<NF, /*Masked*/1, /*Strided*/1, !logtwo(EEW), VLMul> { let mayLoad = 0; let mayStore = 1; @@ -1782,7 +1782,7 @@ class VPseudoISegStoreNoMask<VReg ValClass, bit Ordered> : RISCVVPseudo<(outs), (ins ValClass:$rd, GPRMemZeroOffset:$rs1, IdxClass: $index, - AVL:$vl, sew:$sew), []>, + AVL:$vl, sew:$sew)>, RISCVVSXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -1799,7 +1799,7 @@ class VPseudoISegStoreMask<VReg ValClass, bit Ordered> : RISCVVPseudo<(outs), (ins ValClass:$rd, 
GPRMemZeroOffset:$rs1, IdxClass: $index, - VMaskOp:$vm, AVL:$vl, sew:$sew), []>, + VMaskOp:$vm, AVL:$vl, sew:$sew)>, RISCVVSXSEG<NF, /*Masked*/1, Ordered, !logtwo(EEW), VLMul, LMUL> { let mayLoad = 0; let mayStore = 1; @@ -6703,7 +6703,7 @@ let Predicates = [HasVInstructions] in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let HasSEWOp = 1, BaseInstr = VMV_X_S in def PseudoVMV_X_S: - RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew), []>, + RISCVVPseudo<(outs GPR:$rd), (ins VR:$rs2, sew:$sew)>, Sched<[WriteVMovXS, ReadVMovXS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $passthru" in @@ -6723,8 +6723,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach f = FPList in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in def "PseudoVFMV_" # f.FX # "_S" : - RISCVVPseudo<(outs f.fprclass:$rd), - (ins VR:$rs2, sew:$sew), []>, + RISCVVPseudo<(outs f.fprclass:$rd), (ins VR:$rs2, sew:$sew)>, Sched<[WriteVMovFS, ReadVMovFS]>; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, Constraints = "$rd = $passthru" in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 1bb67f4..c75addd9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -11,6 +11,20 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// RISC-V specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDT_NDS_FMV_BF16_X + : SDTypeProfile<1, 1, [SDTCisVT<0, bf16>, SDTCisVT<1, XLenVT>]>; +def SDT_NDS_FMV_X_ANYEXTBF16 + : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisVT<1, bf16>]>; + +def riscv_nds_fmv_bf16_x + : SDNode<"RISCVISD::NDS_FMV_BF16_X", SDT_NDS_FMV_BF16_X>; +def riscv_nds_fmv_x_anyextbf16 + : SDNode<"RISCVISD::NDS_FMV_X_ANYEXTBF16", SDT_NDS_FMV_X_ANYEXTBF16>; + +//===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// @@ -773,6 +787,25 @@ def : Pat<(bf16 (fpround FPR32:$rs)), (NDS_FCVT_BF16_S FPR32:$rs)>; } // Predicates = [HasVendorXAndesBFHCvt] +let isCodeGenOnly = 1 in { +def NDS_FMV_BF16_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR16, GPR, "fmv.w.x">, + Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>; +def NDS_FMV_X_BF16 : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR16, "fmv.x.w">, + Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; +} + +let Predicates = [HasVendorXAndesBFHCvt] in { +def : Pat<(riscv_nds_fmv_bf16_x GPR:$src), (NDS_FMV_BF16_X GPR:$src)>; +def : Pat<(riscv_nds_fmv_x_anyextbf16 (bf16 FPR16:$src)), + (NDS_FMV_X_BF16 (bf16 FPR16:$src))>; +} // Predicates = [HasVendorXAndesBFHCvt] + +// Use flh/fsh to load/store bf16 if zfh is enabled. 
+let Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] in { +def : LdPat<load, FLH, bf16>; +def : StPat<store, FSH, FPR16, bf16>; +} // Predicates = [HasStdExtZfh, HasVendorXAndesBFHCvt] + let Predicates = [HasVendorXAndesVBFHCvt] in { defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16; defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index f173440..ed1a60a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -291,31 +291,31 @@ def : CompressPat<(MUL GPRC:$rs1, GPRC:$rs2, GPRC:$rs1), let Predicates = [HasStdExtZcb, HasStdExtZbb] in{ def : CompressPat<(SEXT_B GPRC:$rs1, GPRC:$rs1), - (C_SEXT_B GPRC:$rs1, GPRC:$rs1)>; + (C_SEXT_B GPRC:$rs1)>; def : CompressPat<(SEXT_H GPRC:$rs1, GPRC:$rs1), - (C_SEXT_H GPRC:$rs1, GPRC:$rs1)>; + (C_SEXT_H GPRC:$rs1)>; } // Predicates = [HasStdExtZcb, HasStdExtZbb] let Predicates = [HasStdExtZcb, HasStdExtZbb] in{ def : CompressPat<(ZEXT_H_RV32 GPRC:$rs1, GPRC:$rs1), - (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_H GPRC:$rs1)>; def : CompressPat<(ZEXT_H_RV64 GPRC:$rs1, GPRC:$rs1), - (C_ZEXT_H GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_H GPRC:$rs1)>; } // Predicates = [HasStdExtZcb, HasStdExtZbb] let Predicates = [HasStdExtZcb] in{ def : CompressPat<(ANDI GPRC:$rs1, GPRC:$rs1, 255), - (C_ZEXT_B GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_B GPRC:$rs1)>; } // Predicates = [HasStdExtZcb] let Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] in{ def : CompressPat<(ADD_UW GPRC:$rs1, GPRC:$rs1, X0), - (C_ZEXT_W GPRC:$rs1, GPRC:$rs1)>; + (C_ZEXT_W GPRC:$rs1)>; } // Predicates = [HasStdExtZcb, HasStdExtZba, IsRV64] let Predicates = [HasStdExtZcb] in{ def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1), - (C_NOT GPRC:$rs1, GPRC:$rs1)>; + (C_NOT GPRC:$rs1)>; } let Predicates = [HasStdExtZcb] in{ diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 0565fcd..30d8f85 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -224,10 +224,10 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // Note: Same VL as above, but i32 not xlen due to signature of - // vp.strided.load - VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); + // For rv64, need to truncate i64 to i32 to match signature. As VL is at most + // the number of active lanes (which is bounded by i32) this is safe. + VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); + CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, @@ -302,10 +302,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store, Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); - // Note: Same VL as above, but i32 not xlen due to signature of - // vp.strided.store - VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); + // For rv64, need to truncate i64 to i32 to match signature. As VL is at + // most the number of active lanes (which is bounded by i32) this is safe. 
+ VL = Builder.CreateTrunc(VL, Builder.getInt32Ty()); CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index e87f452..ccb39e8 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -268,6 +268,11 @@ def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)> { let DiagnosticString = "register must be a GPR excluding zero (x0)"; } +def GPRNoX2 : GPRRegisterClass<(sub GPR, X2)> { + let DiagnosticType = "InvalidRegClassGPRNoX2"; + let DiagnosticString = "register must be a GPR excluding sp (x2)"; +} + def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)> { let DiagnosticType = "InvalidRegClassGPRNoX0X2"; let DiagnosticString = "register must be a GPR excluding zero (x0) and sp (x2)"; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 3e286a7..bf23812 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); } +defvar SMX60VLEN = 256; +defvar SMX60DLEN = !div(SMX60VLEN, 2); + +class Get1248Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 4, + !eq(mx, "M8") : 8, + true: 1 + ); +} + +// Used for: logical opsz, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides +class Get4816Latency<string mx> { + int c = !cond( + !eq(mx, "M4") : 8, + !eq(mx, "M8") : 16, + true: 4 + ); +} + +// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max +class Get458Latency<string mx> { + int c = !cond( + !eq(mx, "M4") : 5, + !eq(mx, "M8") : 8, + true: 4 + ); +} + +// Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs +// Used for: widening operations +class Get4588Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 5, + !eq(mx, "M4") : 8, + !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback + true: 4 + ); +} + +// Used for: mask-producing comparisons, carry ops with mask, FP comparisons +class Get461018Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 6, + !eq(mx, "M4") : 10, + !eq(mx, "M8") : 18, + true: 4 + ); +} + +// Used for: e64 multiply pattern, complex ops +class Get781632Latency<string mx> { + int c = !cond( + !eq(mx, "M2") : 8, + !eq(mx, "M4") : 16, + !eq(mx, "M8") : 32, + true: 7 + ); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -322,58 +383,96 @@ foreach LMul = [1, 2, 4, 8] in { foreach mx = SchedMxList in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; - defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, 
IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; - - defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; - - defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in { + // Pattern of vadd, vsub, vrsub: 4/4/5/8 + // Pattern of vand, vor, vxor: 4/4/8/16 + // They are grouped together, so we used the worst case 4/4/8/16 + // TODO: use InstRW to override individual instructions' scheduling data + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + } + + let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + } + + // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8, + // e64 = 7,8,16,32. We use the worst-case until we can split the SEW. + // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites + let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in { + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + } } // Widening +// Pattern of vwmul, vwmacc, etc: e8/e16 = 4/4/5/8, e32 = 5,5,5,8 +// We use the worst-case for all. foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; + } } -// Vector Integer Division and Remainder +// Division and remainder operations +// Pattern of vdivu: 11/11/11/20/40/80/160 +// Pattern of vdiv: 12/12/12/22/44/88/176 +// Pattern of vremu: 12/12/12/22/44/88/176 +// Pattern of vrem: 13/13/13/24/48/96/192 +// We use for all: 12/12/12/24/48/96/192 +// TODO: Create separate WriteVIRem to more closely match the latencies foreach mx = SchedMxList in { foreach sew = SchedSEWSet<mx>.val in { defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; - defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; - defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + // Slightly reduced for fractional LMULs + defvar Multiplier = !cond( + !eq(mx, "MF8") : 12, + !eq(mx, "MF4") : 12, + !eq(mx, "MF2") : 12, + true: 24 + ); + + let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in { + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } } } @@ -381,12 +480,21 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListW in { defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; - defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; - 
defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; + // Slightly increased for integer LMULs + defvar Multiplier = !cond( + !eq(mx, "M2") : 2, + !eq(mx, "M4") : 2, + true: 1 + ); + + let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in { + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; + } } // 12. Vector Fixed-Point Arithmetic Instructions diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index b43b915..da6ac2f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -104,11 +104,6 @@ static cl::opt<bool> EnablePostMISchedLoadStoreClustering( cl::desc("Enable PostRA load and store clustering in the machine scheduler"), cl::init(true)); -static cl::opt<bool> - EnableVLOptimizer("riscv-enable-vl-optimizer", - cl::desc("Enable the RISC-V VL Optimizer pass"), - cl::init(true), cl::Hidden); - static cl::opt<bool> DisableVectorMaskMutation( "riscv-disable-vector-mask-mutation", cl::desc("Disable the vector mask scheduling mutation"), cl::init(false), @@ -617,8 +612,7 @@ void RISCVPassConfig::addPreRegAlloc() { addPass(createRISCVPreRAExpandPseudoPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createRISCVMergeBaseOffsetOptPass()); - if (EnableVLOptimizer) - addPass(createRISCVVLOptimizerPass()); + addPass(createRISCVVLOptimizerPass()); } addPass(createRISCVInsertReadWriteCSRPass()); diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index b53d919..c946451 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -114,14 +114,6 @@ FunctionPass *llvm::createRISCVVLOptimizerPass() { return new RISCVVLOptimizer(); } -/// Return true if R is a physical or virtual vector register, false otherwise. -static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) { - if (R.isPhysical()) - return RISCV::VRRegClass.contains(R); - const TargetRegisterClass *RC = MRI->getRegClass(R); - return RISCVRI::isVRegClass(RC->TSFlags); -} - LLVM_ATTRIBUTE_UNUSED static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) { OI.print(OS); @@ -183,37 +175,28 @@ static unsigned getIntegerExtensionOperandEEW(unsigned Factor, return Log2EEW; } -/// Check whether MO is a mask operand of MI. 
-static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO, - const MachineRegisterInfo *MRI) { - - if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI)) - return false; - - const MCInstrDesc &Desc = MI.getDesc(); - return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID; -} - static std::optional<unsigned> getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { const MachineInstr &MI = *MO.getParent(); + const MCInstrDesc &Desc = MI.getDesc(); const RISCVVPseudosTable::PseudoInfo *RVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); assert(RVV && "Could not find MI in PseudoTable"); // MI has a SEW associated with it. The RVV specification defines // the EEW of each operand and definition in relation to MI.SEW. - unsigned MILog2SEW = - MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(Desc)).getImm(); - const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()); - const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags); + const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc); + const bool IsTied = RISCVII::isTiedPseudo(Desc.TSFlags); bool IsMODef = MO.getOperandNo() == 0 || (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs()); // All mask operands have EEW=1 - if (isMaskOperand(MI, MO, MRI)) + const MCOperandInfo &Info = Desc.operands()[MO.getOperandNo()]; + if (Info.OperandType == MCOI::OPERAND_REGISTER && + Info.RegClass == RISCV::VMV0RegClassID) return 0; // switch against BaseInstr to reduce number of cases that need to be @@ -1296,8 +1279,8 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { TII->get(RISCV::getRVVMCOpcode(MI.getOpcode())).TSFlags) && "Instruction shouldn't be supported if elements depend on VL"); - assert(MI.getOperand(0).isReg() && - isVectorRegClass(MI.getOperand(0).getReg(), MRI) && + assert(RISCVRI::isVRegClass( + MRI->getRegClass(MI.getOperand(0).getReg())->TSFlags) && "All supported instructions produce a vector register result"); LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n"); diff --git a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp index bbf1d87..cfe7ef4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAPI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAPI.cpp @@ -116,8 +116,8 @@ SPIRVTranslate(Module *M, std::string &SpirvObj, std::string &ErrMsg, PM.add(new TargetLibraryInfoWrapperPass(TLII)); std::unique_ptr<MachineModuleInfoWrapperPass> MMIWP( new MachineModuleInfoWrapperPass(Target.get())); - const_cast<TargetLoweringObjectFile *>(Target->getObjFileLowering()) - ->Initialize(MMIWP->getMMI().getContext(), *Target); + Target->getObjFileLowering()->Initialize(MMIWP->getMMI().getContext(), + *Target); SmallString<4096> OutBuffer; raw_svector_ostream OutStream(OutBuffer); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index b90e1aa..3c631ce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -665,10 +665,10 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( auto *HandleType = cast<TargetExtType>(II->getOperand(0)->getType()); if (HandleType->getTargetExtName() == "spirv.Image" || HandleType->getTargetExtName() == "spirv.SignedImage") { - if (II->hasOneUse()) { - auto *U = *II->users().begin(); + for (User *U : II->users()) { Ty = cast<Instruction>(U)->getAccessType(); - assert(Ty && "Unable to get type for 
resource pointer."); + if (Ty) + break; } } else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") { // This call is supposed to index into an array diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 4a9c88b..a95c4ff 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcFixupKinds.h" -#include "MCTargetDesc/SparcMCAsmInfo.h" #include "MCTargetDesc/SparcMCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index 1ee6e80..79da53e 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -13,10 +13,7 @@ #include "MCTargetDesc/SparcMCAsmInfo.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCValue.h" using namespace llvm; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp index 3ef6030..72bb372 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp @@ -69,8 +69,8 @@ void SystemZHLASMAsmStreamer::EmitEOL() { void SystemZHLASMAsmStreamer::changeSection(MCSection *Section, uint32_t Subsection) { - Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, - Subsection); + MAI->printSwitchToSection(*Section, Subsection, + getContext().getTargetTriple(), OS); MCStreamer::changeSection(Section, Subsection); } diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 19c9e9c..6ae69a4 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -900,7 +900,8 @@ public: bool checkDataSection() { if (CurrentState != DataSection) { - auto *WS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly()); + auto *WS = static_cast<const MCSectionWasm *>( + getStreamer().getCurrentSectionOnly()); if (WS && WS->isText()) return error("data directive must occur in a data segment: ", Lexer.getTok()); @@ -1218,7 +1219,8 @@ public: void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override { // Code below only applies to labels in text sections. 
- auto *CWS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly()); + auto *CWS = static_cast<const MCSectionWasm *>( + getStreamer().getCurrentSectionOnly()); if (!CWS->isText()) return; diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index 13603f8..a606209 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -71,6 +71,7 @@ def FeatureReferenceTypes : SubtargetFeature<"reference-types", "HasReferenceTypes", "true", "Enable reference types">; +def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">; def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD", "Enable relaxed-simd instructions">; @@ -136,13 +137,13 @@ def : ProcessorModel<"lime1", NoSchedModel, // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, - [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, - FeatureCallIndirectOverlong, FeatureExceptionHandling, - FeatureExtendedConst, FeatureFP16, FeatureMultiMemory, - FeatureMultivalue, FeatureMutableGlobals, - FeatureNontrappingFPToInt, FeatureRelaxedSIMD, - FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt, - FeatureTailCall]>; + [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, + FeatureCallIndirectOverlong, FeatureExceptionHandling, + FeatureExtendedConst, FeatureFP16, FeatureMultiMemory, + FeatureMultivalue, FeatureMutableGlobals, + FeatureNontrappingFPToInt, FeatureRelaxedSIMD, + FeatureReferenceTypes, FeatureGC, FeatureSIMD128, + FeatureSignExt, FeatureTailCall]>; //===----------------------------------------------------------------------===// // Target Declaration diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index fa5776c..b03b350 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -123,7 +123,7 @@ static SDValue getTagSymNode(int Tag, SelectionDAG *DAG) { static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL, SmallVector<MVT, 4> &Returns, SmallVector<MVT, 4> &Params) { - auto toWasmValType = [&](MVT VT) { + auto toWasmValType = [](MVT VT) { if (VT == MVT::i32) { return wasm::ValType::I32; } @@ -244,10 +244,18 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) { MCSymbol *Table = WebAssembly::getOrCreateFunctionTableSymbol( MF.getContext(), Subtarget); SDValue TableSym = CurDAG->getMCSymbol(Table, PtrVT); - SDValue FuncRef = SDValue( - CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL, - MVT::funcref, TableSym, Node->getOperand(1)), - 0); + SDValue FuncPtr = Node->getOperand(1); + if (Subtarget->hasAddr64() && FuncPtr.getValueType() == MVT::i64) { + // table.get expects an i32 but on 64 bit platforms the function pointer + // is an i64. In that case, i32.wrap_i64 to convert. + FuncPtr = SDValue(CurDAG->getMachineNode(WebAssembly::I32_WRAP_I64, DL, + MVT::i32, FuncPtr), + 0); + } + SDValue FuncRef = + SDValue(CurDAG->getMachineNode(WebAssembly::TABLE_GET_FUNCREF, DL, + MVT::funcref, TableSym, FuncPtr), + 0); // Encode the signature information into the type index placeholder. 
// This gets decoded and converted into the actual type signature in diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 11936a3..cd434f7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -288,7 +288,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Expand float operations supported for scalars but not SIMD for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, - ISD::FEXP, ISD::FEXP2}) + ISD::FEXP, ISD::FEXP2, ISD::FEXP10}) for (auto T : {MVT::v4f32, MVT::v2f64}) setOperationAction(Op, T, Expand); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index b5e723e..2b632fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -76,6 +76,9 @@ def HasReferenceTypes : Predicate<"Subtarget->hasReferenceTypes()">, AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">; +def HasGC : Predicate<"Subtarget->hasGC()">, + AssemblerPredicate<(all_of FeatureGC), "gc">; + def HasRelaxedSIMD : Predicate<"Subtarget->hasRelaxedSIMD()">, AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index 40b87a0..fc82e5b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -36,13 +36,10 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> { Requires<[HasReferenceTypes]>; } -defm REF_TEST_FUNCREF : - I<(outs I32: $res), - (ins TypeIndex:$type, FUNCREF: $ref), - (outs), - (ins TypeIndex:$type), - [], - "ref.test\t$type, $ref", "ref.test $type", 0xfb14>; +defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref), + (outs), (ins TypeIndex:$type), [], + "ref.test\t$type, $ref", "ref.test $type", 0xfb14>, + Requires<[HasGC]>; defm "" : REF_I<FUNCREF, funcref, "func">; defm "" : REF_I<EXTERNREF, externref, "extern">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index 7912aeb..ffd135d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -63,8 +63,10 @@ void OptimizeReturned::visitCallBase(CallBase &CB) { if (isa<Constant>(Arg)) continue; // Like replaceDominatedUsesWith but using Instruction/Use dominance. 
- Arg->replaceUsesWithIf(&CB, - [&](Use &U) { return DT->dominates(&CB, U); }); + Arg->replaceUsesWithIf(&CB, [&](Use &U) { + auto *I = cast<Instruction>(U.getUser()); + return !I->isLifetimeStartOrEnd() && DT->dominates(&CB, U); + }); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 40ea48a..a3ce40f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -43,6 +43,11 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU, Bits.set(WebAssembly::FeatureBulkMemoryOpt); } + // gc implies reference-types + if (HasGC) { + HasReferenceTypes = true; + } + // reference-types implies call-indirect-overlong if (HasReferenceTypes) { HasCallIndirectOverlong = true; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index 591ce256..f814274 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -51,6 +51,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasMutableGlobals = false; bool HasNontrappingFPToInt = false; bool HasReferenceTypes = false; + bool HasGC = false; bool HasSignExt = false; bool HasTailCall = false; bool HasWideArithmetic = false; @@ -107,6 +108,7 @@ public: bool hasMutableGlobals() const { return HasMutableGlobals; } bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; } bool hasReferenceTypes() const { return HasReferenceTypes; } + bool hasGC() const { return HasGC; } bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; } bool hasSignExt() const { return HasSignExt; } bool hasSIMD128() const { return SIMDLevel >= SIMD128; } diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b642c1c..d7671ed 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1042,8 +1042,8 @@ private: } PrevState = CurrState; } - void onRParen() { - PrevState = State; + bool onRParen(StringRef &ErrMsg) { + IntelExprState CurrState = State; switch (State) { default: State = IES_ERROR; @@ -1054,9 +1054,27 @@ private: case IES_RBRAC: case IES_RPAREN: State = IES_RPAREN; + // In the case of a multiply, onRegister has already set IndexReg + // directly, with appropriate scale. + // Otherwise if we just saw a register it has only been stored in + // TmpReg, so we need to store it into the state machine. + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // no explicit scale. 
+ if (!BaseReg) { + BaseReg = TmpReg; + } else { + if (IndexReg) + return regsUseUpError(ErrMsg); + IndexReg = TmpReg; + Scale = 0; + } + } IC.pushOperator(IC_RPAREN); break; } + PrevState = CurrState; + return false; } bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID, const InlineAsmIdentifierInfo &IDInfo, @@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } break; case AsmToken::LParen: SM.onLParen(); break; - case AsmToken::RParen: SM.onRParen(); break; + case AsmToken::RParen: + if (SM.onRParen(ErrMsg)) { + return Error(Tok.getLoc(), ErrMsg); + } + break; } if (SM.hadError()) return Error(Tok.getLoc(), "unknown token in expression"); @@ -4781,7 +4803,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { getStreamer().initSections(false, getSTI()); Section = getStreamer().getCurrentSectionOnly(); } - if (Section->useCodeAlign()) + if (getContext().getAsmInfo()->useCodeAlign(*Section)) getStreamer().emitCodeAlignment(Align(2), &getSTI(), 0); else getStreamer().emitValueToAlignment(Align(2), 0, 1, 0); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index e213923..7f9d474 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -388,36 +388,6 @@ static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) { return false; } -/// Check if the instruction to be emitted is right after any data. -static bool -isRightAfterData(MCFragment *CurrentFragment, - const std::pair<MCFragment *, size_t> &PrevInstPosition) { - MCFragment *F = CurrentFragment; - // Since data is always emitted into a DataFragment, our check strategy is - // simple here. - // - If the fragment is a DataFragment - // - If it's empty (section start or data after align), return false. - // - If it's not the fragment where the previous instruction is, - // returns true. - // - If it's the fragment holding the previous instruction but its - // size changed since the previous instruction was emitted into - // it, returns true. - // - Otherwise returns false. - // - If the fragment is not a DataFragment, returns false. - if (F->getKind() == MCFragment::FT_Data) - return F->getFixedSize() && (F != PrevInstPosition.first || - F->getFixedSize() != PrevInstPosition.second); - - return false; -} - -/// \returns the fragment size if it has instructions, otherwise returns 0. -static size_t getSizeForInstFragment(const MCFragment *F) { - if (!F || !F->hasInstructions()) - return 0; - return F->getSize(); -} - /// Return true if we can insert NOP or prefixes automatically before the /// the instruction to be emitted. bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { @@ -441,9 +411,11 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { // semantic. return false; - if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) - // If this instruction follows any data, there is no clear - // instruction boundary, inserting a nop/prefix would change semantic. + // If this instruction follows any data, there is no clear instruction + // boundary, inserting a nop/prefix would change semantic. 
+ auto Offset = OS.getCurFragSize(); + if (Offset && (OS.getCurrentFragment() != PrevInstPosition.first || + Offset != PrevInstPosition.second)) return false; return true; @@ -552,7 +524,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, // Update PrevInstOpcode here, canPadInst() reads that. MCFragment *CF = OS.getCurrentFragment(); PrevInstOpcode = Inst.getOpcode(); - PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); + PrevInstPosition = std::make_pair(CF, OS.getCurFragSize()); if (!canPadBranches(OS)) return; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index f5eeb3b..d691538 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "X86MCAsmInfo.h" -#include "MCTargetDesc/X86MCExpr.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index 620526ff..3f2a433 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -12,8 +12,52 @@ // NOTE: NO INCLUDE GUARD DESIRED! +#ifndef DUMMY_FUNCTION_PASS +#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) +#endif +DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) +DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this)) +DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) +DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) +#undef DUMMY_FUNCTION_PASS + #ifndef MACHINE_FUNCTION_PASS #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) #endif MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this)) #undef MACHINE_FUNCTION_PASS + +#ifndef DUMMY_MACHINE_FUNCTION_PASS +#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME) +#endif +DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization()) +DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS()) +DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment()) +DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander()) +DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix()) +DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-LEAs", FixupLEAPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-bw-inst", FixupBWInstPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-inst-tuning", X86FixupInstTuningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-setcc", X86FixupSetCCPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-vector-constants", X86FixupVectorConstantsPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-flags-copy-lowering", X86FlagsCopyLoweringPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-lower-tile-copy", X86LowerTileCopy()) +DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-load", X86LoadValueInjectionLoadHardeningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-ret", X86LoadValueInjectionRetHardeningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-optimize-LEAs", 
X86OptimizeLEAPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-pseudo", X86ExpandPseudo()) +DUMMY_MACHINE_FUNCTION_PASS("x86-return-thunks", X86ReturnThunks()) +DUMMY_MACHINE_FUNCTION_PASS("x86-seses", X86SpeculativeExecutionSideEffectSuppression()) +DUMMY_MACHINE_FUNCTION_PASS("x86-slh", X86SpeculativeLoadHardeningPass()) +DUMMY_MACHINE_FUNCTION_PASS("x86-suppress-apx-for-relocation", X86SuppressAPXForRelocationPass()) +DUMMY_MACHINE_FUNCTION_PASS("tile-pre-config", X86PreTileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("tileconfig", X86TileConfig()) +DUMMY_MACHINE_FUNCTION_PASS("x86-wineh-unwindv2", X86WinEHUnwindV2()) +DUMMY_MACHINE_FUNCTION_PASS("x86argumentstackrebase", X86ArgumentStackSlotPass()) +#undef DUMMY_MACHINE_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 37a7b37..90791fc 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1838,14 +1838,15 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb + { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb - { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw - { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw + { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw + { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2 + { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw @@ -1874,18 +1875,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX512ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd - {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss - {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq - {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd - {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb - - {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd - {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps - {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq - {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd + {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd + {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd + {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss + {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss + {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq + {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq + 
{TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb + {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb + + {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd + {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps + {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq + {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca @@ -1973,21 +1981,24 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX2ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastpd - { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastps - { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq - { TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd - { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw - { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb - - { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd - { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps - { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq - { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd - { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb + { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb + { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd + { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps + { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq + { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd + { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb @@ -2077,23 +2088,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry AVX1ShuffleTbl[] = 
{ - {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128 - {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128 - {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128 - - {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd - {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps - {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb + {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd + {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps + {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd + {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps + {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128 + {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128 + {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128 + + {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd + {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps + {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd + {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps + {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb // + vinsertf128 - {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb + {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb // + vinsertf128 - {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb + {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb // + vinsertf128 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd @@ -2156,13 +2167,13 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * *KindCost; static const CostKindTblEntry SSSE3ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb + {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb + {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb + {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb - {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb - {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb + {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb + {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb + {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por @@ -2192,16 +2203,16 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd - {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd - {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd - 
{TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd - {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd - {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd - {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw + {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + packus {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 4ca7444..e5c896f 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["permlane16-swap"] = true; Features["ashr-pk-insts"] = true; Features["atomic-buffer-pk-add-bf16-inst"] = true; + Features["vmem-pref-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["atomic-buffer-global-pk-add-f16-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index be51453..ee6651c 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -8,7 +8,6 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/CodeGen.h" diff --git a/llvm/lib/TextAPI/SymbolSet.cpp b/llvm/lib/TextAPI/SymbolSet.cpp index 2e0b416..f21a061 100644 --- a/llvm/lib/TextAPI/SymbolSet.cpp +++ b/llvm/lib/TextAPI/SymbolSet.cpp @@ -11,6 +11,11 @@ using namespace llvm; using namespace llvm::MachO; +SymbolSet::~SymbolSet() { + for (auto &[Key, Sym] : Symbols) + Sym->~Symbol(); +} + Symbol *SymbolSet::addGlobalImpl(EncodeKind Kind, StringRef Name, SymbolFlags Flags) { Name = copyString(Name); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 59ae057..ac93f748 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -85,6 +85,9 @@ static Intrinsic::ID NonOverloadedCoroIntrinsics[] = { Intrinsic::coro_id_async, Intrinsic::coro_id_retcon, Intrinsic::coro_id_retcon_once, + Intrinsic::coro_noop, + Intrinsic::coro_prepare_async, + Intrinsic::coro_prepare_retcon, Intrinsic::coro_promise, Intrinsic::coro_resume, Intrinsic::coro_save, diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index 5a87cf8..b3910c4 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -48,6 +48,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -63,7 +64,7 @@ static inline void eraseFromModule(T 
&ToErase) { ToErase.eraseFromParent(); } -static inline bool checkIfSupported(GlobalVariable &G) { +static bool checkIfSupported(GlobalVariable &G) { if (!G.isThreadLocal()) return true; @@ -114,24 +115,221 @@ static inline void clearModule(Module &M) { // TODO: simplify. eraseFromModule(*M.ifuncs().begin()); } +static SmallVector<std::reference_wrapper<Use>> +collectIndirectableUses(GlobalVariable *G) { + // We are interested only in use chains that end in an Instruction. + SmallVector<std::reference_wrapper<Use>> Uses; + + SmallVector<std::reference_wrapper<Use>> Stack(G->use_begin(), G->use_end()); + while (!Stack.empty()) { + Use &U = Stack.pop_back_val(); + if (isa<Instruction>(U.getUser())) + Uses.emplace_back(U); + else + transform(U.getUser()->uses(), std::back_inserter(Stack), + [](auto &&U) { return std::ref(U); }); + } + + return Uses; +} + +static inline GlobalVariable *getGlobalForName(GlobalVariable *G) { + // Create an anonymous global which stores the variable's name, which will be + // used by the HIPSTDPAR runtime to look up the program-wide symbol. + LLVMContext &Ctx = G->getContext(); + auto *CDS = ConstantDataArray::getString(Ctx, G->getName()); + + GlobalVariable *N = G->getParent()->getOrInsertGlobal("", CDS->getType()); + N->setInitializer(CDS); + N->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage); + N->setConstant(true); + + return N; +} + +static inline GlobalVariable *getIndirectionGlobal(Module *M) { + // Create an anonymous global which stores a pointer to a pointer, which will + // be externally initialised by the HIPSTDPAR runtime with the address of the + // program-wide symbol. + Type *PtrTy = PointerType::get( + M->getContext(), M->getDataLayout().getDefaultGlobalsAddressSpace()); + GlobalVariable *NewG = M->getOrInsertGlobal("", PtrTy); + + NewG->setInitializer(PoisonValue::get(NewG->getValueType())); + NewG->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage); + NewG->setConstant(true); + NewG->setExternallyInitialized(true); + + return NewG; +} + +static Constant * +appendIndirectedGlobal(const GlobalVariable *IndirectionTable, + SmallVector<Constant *> &SymbolIndirections, + GlobalVariable *ToIndirect) { + Module *M = ToIndirect->getParent(); + + auto *InitTy = cast<StructType>(IndirectionTable->getValueType()); + auto *SymbolListTy = cast<StructType>(InitTy->getStructElementType(2)); + Type *NameTy = SymbolListTy->getElementType(0); + Type *IndirectTy = SymbolListTy->getElementType(1); + + Constant *NameG = getGlobalForName(ToIndirect); + Constant *IndirectG = getIndirectionGlobal(M); + Constant *Entry = ConstantStruct::get( + SymbolListTy, {ConstantExpr::getAddrSpaceCast(NameG, NameTy), + ConstantExpr::getAddrSpaceCast(IndirectG, IndirectTy)}); + SymbolIndirections.push_back(Entry); + + return IndirectG; +} + +static void fillIndirectionTable(GlobalVariable *IndirectionTable, + SmallVector<Constant *> Indirections) { + Module *M = IndirectionTable->getParent(); + size_t SymCnt = Indirections.size(); + + auto *InitTy = cast<StructType>(IndirectionTable->getValueType()); + Type *SymbolListTy = InitTy->getStructElementType(1); + auto *SymbolTy = cast<StructType>(InitTy->getStructElementType(2)); + + Constant *Count = ConstantInt::get(InitTy->getStructElementType(0), SymCnt); + M->removeGlobalVariable(IndirectionTable); + GlobalVariable *Symbols = + M->getOrInsertGlobal("", ArrayType::get(SymbolTy, SymCnt)); + Symbols->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage); + Symbols->setInitializer( + ConstantArray::get(ArrayType::get(SymbolTy, 
SymCnt), {Indirections})); + Symbols->setConstant(true); + + Constant *ASCSymbols = ConstantExpr::getAddrSpaceCast(Symbols, SymbolListTy); + Constant *Init = ConstantStruct::get( + InitTy, {Count, ASCSymbols, PoisonValue::get(SymbolTy)}); + M->insertGlobalVariable(IndirectionTable); + IndirectionTable->setInitializer(Init); +} + +static void replaceWithIndirectUse(const Use &U, const GlobalVariable *G, + Constant *IndirectedG) { + auto *I = cast<Instruction>(U.getUser()); + + IRBuilder<> Builder(I); + unsigned OpIdx = U.getOperandNo(); + Value *Op = I->getOperand(OpIdx); + + // We walk back up the use chain, which could be an arbitrarily long sequence + // of constexpr AS casts, ptr-to-int and GEP instructions, until we reach the + // indirected global. + while (auto *CE = dyn_cast<ConstantExpr>(Op)) { + assert((CE->getOpcode() == Instruction::GetElementPtr || + CE->getOpcode() == Instruction::AddrSpaceCast || + CE->getOpcode() == Instruction::PtrToInt) && + "Only GEP, ASCAST or PTRTOINT constant uses supported!"); + + Instruction *NewI = Builder.Insert(CE->getAsInstruction()); + I->replaceUsesOfWith(Op, NewI); + I = NewI; + Op = I->getOperand(0); + OpIdx = 0; + Builder.SetInsertPoint(I); + } + + assert(Op == G && "Must reach indirected global!"); + + I->setOperand(OpIdx, Builder.CreateLoad(G->getType(), IndirectedG)); +} + +static inline bool isValidIndirectionTable(GlobalVariable *IndirectionTable) { + std::string W; + raw_string_ostream OS(W); + + Type *Ty = IndirectionTable->getValueType(); + bool Valid = false; + + if (!isa<StructType>(Ty)) { + OS << "The Indirection Table must be a struct type; "; + Ty->print(OS); + OS << " is incorrect.\n"; + } else if (cast<StructType>(Ty)->getNumElements() != 3u) { + OS << "The Indirection Table must have 3 elements; " + << cast<StructType>(Ty)->getNumElements() << " is incorrect.\n"; + } else if (!isa<IntegerType>(cast<StructType>(Ty)->getStructElementType(0))) { + OS << "The first element in the Indirection Table must be an integer; "; + cast<StructType>(Ty)->getStructElementType(0)->print(OS); + OS << " is incorrect.\n"; + } else if (!isa<PointerType>(cast<StructType>(Ty)->getStructElementType(1))) { + OS << "The second element in the Indirection Table must be a pointer; "; + cast<StructType>(Ty)->getStructElementType(1)->print(OS); + OS << " is incorrect.\n"; + } else if (!isa<StructType>(cast<StructType>(Ty)->getStructElementType(2))) { + OS << "The third element in the Indirection Table must be a struct type; "; + cast<StructType>(Ty)->getStructElementType(2)->print(OS); + OS << " is incorrect.\n"; + } else { + Valid = true; + } + + if (!Valid) + IndirectionTable->getContext().diagnose(DiagnosticInfoGeneric(W, DS_Error)); + + return Valid; +} + +static void indirectGlobals(GlobalVariable *IndirectionTable, + SmallVector<GlobalVariable *> ToIndirect) { + // We replace globals with an indirected access via a pointer that will get + // set by the HIPSTDPAR runtime, using their accessible, program-wide unique + // address as set by the host linker-loader. 
+ SmallVector<Constant *> SymbolIndirections; + for (auto &&G : ToIndirect) { + SmallVector<std::reference_wrapper<Use>> Uses = collectIndirectableUses(G); + + if (Uses.empty()) + continue; + + Constant *IndirectedGlobal = + appendIndirectedGlobal(IndirectionTable, SymbolIndirections, G); + + for_each(Uses, + [=](auto &&U) { replaceWithIndirectUse(U, G, IndirectedGlobal); }); + + eraseFromModule(*G); + } + + if (SymbolIndirections.empty()) + return; + + fillIndirectionTable(IndirectionTable, std::move(SymbolIndirections)); +} + static inline void maybeHandleGlobals(Module &M) { unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); - for (auto &&G : M.globals()) { // TODO: should we handle these in the FE? + + SmallVector<GlobalVariable *> ToIndirect; + for (auto &&G : M.globals()) { if (!checkIfSupported(G)) return clearModule(M); - - if (G.isThreadLocal()) - continue; - if (G.isConstant()) - continue; if (G.getAddressSpace() != GlobAS) continue; - if (G.getLinkage() != GlobalVariable::ExternalLinkage) + if (G.isConstant() && G.hasInitializer() && G.hasAtLeastLocalUnnamedAddr()) continue; - G.setLinkage(GlobalVariable::ExternalWeakLinkage); - G.setInitializer(nullptr); - G.setExternallyInitialized(true); + ToIndirect.push_back(&G); + } + + if (ToIndirect.empty()) + return; + + if (auto *IT = M.getNamedGlobal("__hipstdpar_symbol_indirection_table")) { + if (!isValidIndirectionTable(IT)) + return clearModule(M); + return indirectGlobals(IT, std::move(ToIndirect)); + } else { + for (auto &&G : ToIndirect) { + // We will internalise these, so we provide a poison initialiser. + if (!G->hasInitializer()) + G->setInitializer(PoisonValue::get(G->getValueType())); + } } } diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b803c97..0164fcd 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -2073,14 +2073,14 @@ std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, unsigned CloneNo) const { auto VI = FSToVIMap.find(Func); assert(VI != FSToVIMap.end()); + std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo); if (isa<AllocInfo *>(Call)) - return (VI->second.name() + " -> alloc").str(); + return CallerName + " -> alloc"; else { auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call); - return (VI->second.name() + " -> " + - getMemProfFuncName(Callsite->Callee.name(), - Callsite->Clones[CloneNo])) - .str(); + return CallerName + " -> " + + getMemProfFuncName(Callsite->Callee.name(), + Callsite->Clones[CloneNo]); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index d88bc2c..1b78ace 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1830,10 +1830,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { bool IntMinIsPoison = cast<Constant>(II->getArgOperand(1))->isOneValue(); // abs(-x) -> abs(x) - // TODO: Copy nsw if it was present on the neg? 
Value *X; - if (match(IIOperand, m_Neg(m_Value(X)))) + if (match(IIOperand, m_Neg(m_Value(X)))) { + if (cast<Instruction>(IIOperand)->hasNoSignedWrap() || IntMinIsPoison) + replaceOperand(*II, 1, Builder.getTrue()); return replaceOperand(*II, 0, X); + } if (match(IIOperand, m_c_Select(m_Neg(m_Value(X)), m_Deferred(X)))) return replaceOperand(*II, 0, X); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index c5e1b04..da9b126 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -755,8 +755,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // If the base pointers are different, but the indices are the same, just // compare the base pointer. - Value *PtrBase = GEPLHS->getOperand(0); - if (PtrBase != GEPRHS->getOperand(0)) { + if (GEPLHS->getOperand(0) != GEPRHS->getOperand(0)) { bool IndicesTheSame = GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && GEPLHS->getPointerOperand()->getType() == @@ -782,7 +781,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (GEPLHS->isInBounds() && GEPRHS->isInBounds() && (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && - PtrBase->stripPointerCasts() == + GEPLHS->getOperand(0)->stripPointerCasts() == GEPRHS->getOperand(0)->stripPointerCasts() && !GEPLHS->getType()->isVectorTy()) { Value *LOffset = EmitGEPOffset(GEPLHS); @@ -805,14 +804,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, LOffset, ROffset); return replaceInstUsesWith(I, Cmp); } - - // Otherwise, the base pointers are different and the indices are - // different. Try convert this to an indexed compare by looking through - // PHIs/casts. - return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this); } - if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && + if (GEPLHS->getOperand(0) == GEPRHS->getOperand(0) && + GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) { // If the GEPs only differ by one index, compare it. unsigned NumDifferences = 0; // Keep track of # differences. 
@@ -849,11 +844,14 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } } - if (CanFold(NW)) { + if (Base.Ptr && CanFold(Base.LHSNW & Base.RHSNW) && !Base.isExpensive()) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) - Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true); - Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true); - return NewICmp(NW, L, R); + Type *IdxTy = DL.getIndexType(GEPLHS->getType()); + Value *L = + EmitGEPOffsets(Base.LHSGEPs, Base.LHSNW, IdxTy, /*RewriteGEP=*/true); + Value *R = + EmitGEPOffsets(Base.RHSGEPs, Base.RHSNW, IdxTy, /*RewriteGEP=*/true); + return NewICmp(Base.LHSNW & Base.RHSNW, L, R); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 2cc1bc9..0be1034 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,7 +12,6 @@ #include "InstCombineInternal.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -1503,8 +1502,7 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { // This is a non-terminator unreachable marker. Don't remove it. if (isa<UndefValue>(Ptr)) { // Remove guaranteed-to-transfer instructions before the marker. - if (removeInstructionsBeforeUnreachable(SI)) - return &SI; + removeInstructionsBeforeUnreachable(SI); // Remove all instructions after the marker and handle dead blocks this // implies. diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 5849c3e..4e5a8d1 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -363,10 +363,10 @@ private: void tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size); Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag); Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); - bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag, + void instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag, const DominatorTree &DT, const PostDominatorTree &PDT, const LoopInfo &LI); - bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); + void instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); Value *getNextTagWithCall(IRBuilder<> &IRB); Value *getStackBaseTag(IRBuilder<> &IRB); Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, unsigned AllocaNo); @@ -1418,7 +1418,7 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { } } -bool HWAddressSanitizer::instrumentLandingPads( +void HWAddressSanitizer::instrumentLandingPads( SmallVectorImpl<Instruction *> &LandingPadVec) { for (auto *LP : LandingPadVec) { IRBuilder<> IRB(LP->getNextNode()); @@ -1427,10 +1427,9 @@ bool HWAddressSanitizer::instrumentLandingPads( {memtag::readRegister( IRB, (TargetTriple.getArch() == Triple::x86_64) ? 
"rsp" : "sp")}); } - return true; } -bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, +void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, Value *StackTag, Value *UARTag, const DominatorTree &DT, const PostDominatorTree &PDT, @@ -1460,8 +1459,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, size_t Size = memtag::getAllocaSizeInBytes(*AI); size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); - Value *AICast = IRB.CreatePointerCast(AI, PtrTy); - auto HandleLifetime = [&](IntrinsicInst *II) { // Set the lifetime intrinsic to cover the whole alloca. This reduces the // set of assumptions we need to make about the lifetime. Without this we @@ -1474,14 +1471,13 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, // one set of start / end in any execution (i.e. the ends are not // reachable from each other), so this will not cause any problems. II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize)); - II->setArgOperand(1, AICast); }; llvm::for_each(Info.LifetimeStart, HandleLifetime); llvm::for_each(Info.LifetimeEnd, HandleLifetime); - AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) { + AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) { auto *User = U.getUser(); - return User != AILong && User != AICast && !isa<LifetimeIntrinsic>(User); + return User != AILong && !isa<LifetimeIntrinsic>(User); }); memtag::annotateDebugRecords(Info, retagMask(N)); @@ -1524,7 +1520,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, } memtag::alignAndPadAlloca(Info, Mapping.getObjectAlignment()); } - return true; } static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 854db0f..f451c2b 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -80,6 +80,27 @@ static cl::opt<unsigned> ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::desc("Skip Callsite up to this number for this compilation")); +// ICP the candidate function even when only a declaration is present. +static cl::opt<bool> ICPAllowDecls( + "icp-allow-decls", cl::init(false), cl::Hidden, + cl::desc("Promote the target candidate even when the defintion " + " is not available")); + +// ICP hot candidate functions only. When setting to false, non-cold functions +// (warm functions) can also be promoted. +static cl::opt<bool> + ICPAllowHotOnly("icp-allow-hot-only", cl::init(true), cl::Hidden, + cl::desc("Promote the target candidate only if it is a " + "hot function. Otherwise, warm functions can " + "also be promoted")); + +// If one target cannot be ICP'd, proceed with the remaining targets instead +// of exiting the callsite. +static cl::opt<bool> ICPAllowCandidateSkip( + "icp-allow-candidate-skip", cl::init(false), cl::Hidden, + cl::desc("Continue with the remaining targets instead of exiting " + "when failing in a candidate")); + // Set if the pass is called in LTO optimization. The difference for LTO mode // is the pass won't prefix the source module name to the internal linkage // symbols. @@ -330,6 +351,7 @@ private: struct PromotionCandidate { Function *const TargetFunction; const uint64_t Count; + const uint32_t Index; // The following fields only exists for promotion candidates with vtable // information. 
@@ -341,7 +363,8 @@ private: VTableGUIDCountsMap VTableGUIDAndCounts; SmallVector<Constant *> AddressPoints; - PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {} + PromotionCandidate(Function *F, uint64_t C, uint32_t I) + : TargetFunction(F), Count(C), Index(I) {} }; // Check if the indirect-call call site should be promoted. Return the number @@ -356,12 +379,10 @@ private: // Promote a list of targets for one indirect-call callsite by comparing // indirect callee with functions. Return true if there are IR // transformations and false otherwise. - bool tryToPromoteWithFuncCmp(CallBase &CB, Instruction *VPtr, - ArrayRef<PromotionCandidate> Candidates, - uint64_t TotalCount, - ArrayRef<InstrProfValueData> ICallProfDataRef, - uint32_t NumCandidates, - VTableGUIDCountsMap &VTableGUIDCounts); + bool tryToPromoteWithFuncCmp( + CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates, + uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef, + uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts); // Promote a list of targets for one indirect call by comparing vtables with // functions. Return true if there are IR transformations and false @@ -394,12 +415,15 @@ private: Constant *getOrCreateVTableAddressPointVar(GlobalVariable *GV, uint64_t AddressPointOffset); - void updateFuncValueProfiles(CallBase &CB, ArrayRef<InstrProfValueData> VDs, + void updateFuncValueProfiles(CallBase &CB, + MutableArrayRef<InstrProfValueData> VDs, uint64_t Sum, uint32_t MaxMDCount); void updateVPtrValueProfiles(Instruction *VPtr, VTableGUIDCountsMap &VTableGUIDCounts); + bool isValidTarget(uint64_t, Function *, const CallBase &, uint64_t); + public: IndirectCallPromoter( Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO, @@ -419,6 +443,53 @@ public: } // end anonymous namespace +bool IndirectCallPromoter::isValidTarget(uint64_t Target, + Function *TargetFunction, + const CallBase &CB, uint64_t Count) { + // Don't promote if the symbol is not defined in the module. This avoids + // creating a reference to a symbol that doesn't exist in the module + // This can happen when we compile with a sample profile collected from + // one binary but used for another, which may have profiled targets that + // aren't used in the new binary. We might have a declaration initially in + // the case where the symbol is globally dead in the binary and removed by + // ThinLTO. 
+ using namespace ore; + if (TargetFunction == nullptr) { + LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB) + << "Cannot promote indirect call: target with md5sum " + << NV("target md5sum", Target) + << " not found (count=" << NV("Count", Count) << ")"; + }); + return false; + } + if (!ICPAllowDecls && TargetFunction->isDeclaration()) { + LLVM_DEBUG(dbgs() << " Not promote: target definition is not available\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoTargetDef", &CB) + << "Do not promote indirect call: target with md5sum " + << NV("target md5sum", Target) + << " definition not available (count=" << ore::NV("Count", Count) + << ")"; + }); + return false; + } + + const char *Reason = nullptr; + if (!isLegalToPromote(CB, TargetFunction, &Reason)) { + + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB) + << "Cannot promote indirect call to " + << NV("TargetFunction", TargetFunction) + << " (count=" << NV("Count", Count) << "): " << Reason; + }); + return false; + } + return true; +} + // Indirect-call promotion heuristic. The direct targets are sorted based on // the count. Stop at the first target that is not promoted. std::vector<IndirectCallPromoter::PromotionCandidate> @@ -469,38 +540,15 @@ IndirectCallPromoter::getPromotionCandidatesForCallSite( break; } - // Don't promote if the symbol is not defined in the module. This avoids - // creating a reference to a symbol that doesn't exist in the module - // This can happen when we compile with a sample profile collected from - // one binary but used for another, which may have profiled targets that - // aren't used in the new binary. We might have a declaration initially in - // the case where the symbol is globally dead in the binary and removed by - // ThinLTO. Function *TargetFunction = Symtab->getFunction(Target); - if (TargetFunction == nullptr || TargetFunction->isDeclaration()) { - LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n"); - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB) - << "Cannot promote indirect call: target with md5sum " - << ore::NV("target md5sum", Target) << " not found"; - }); - break; - } - - const char *Reason = nullptr; - if (!isLegalToPromote(CB, TargetFunction, &Reason)) { - using namespace ore; - - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB) - << "Cannot promote indirect call to " - << NV("TargetFunction", TargetFunction) << " with count of " - << NV("Count", Count) << ": " << Reason; - }); - break; + if (!isValidTarget(Target, TargetFunction, CB, Count)) { + if (ICPAllowCandidateSkip) + continue; + else + break; } - Ret.push_back(PromotionCandidate(TargetFunction, Count)); + Ret.push_back(PromotionCandidate(TargetFunction, Count, I)); TotalCount -= Count; } return Ret; @@ -642,7 +690,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee, // Promote indirect-call to conditional direct-call for one callsite. 
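// Illustrative sketch only (label and value names here are hypothetical; the
// actual rewrite is performed by promoteCallWithIfThenElse): promoting one
// candidate turns
//   %v = call i32 %fp(i32 %a)
// into
//   %cond = icmp eq ptr %fp, @hot_target
//   br i1 %cond, label %if.then.icp, label %if.else.icp
// if.then.icp:
//   %v.direct = call i32 @hot_target(i32 %a)
// if.else.icp:
//   %v.indirect = call i32 %fp(i32 %a)
// with the two results merged by a phi and the value-profile counts split
// between the promoted direct call and the remaining indirect call.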
bool IndirectCallPromoter::tryToPromoteWithFuncCmp( CallBase &CB, Instruction *VPtr, ArrayRef<PromotionCandidate> Candidates, - uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef, + uint64_t TotalCount, MutableArrayRef<InstrProfValueData> ICallProfDataRef, uint32_t NumCandidates, VTableGUIDCountsMap &VTableGUIDCounts) { uint32_t NumPromoted = 0; @@ -655,6 +703,8 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp( NumOfPGOICallPromotion++; NumPromoted++; + // Update the count and this entry will be erased later. + ICallProfDataRef[C.Index].Count = 0; if (!EnableVTableProfileUse || C.VTableGUIDAndCounts.empty()) continue; @@ -679,21 +729,33 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp( "Number of promoted functions should not be greater than the number " "of values in profile metadata"); - // Update value profiles on the indirect call. - updateFuncValueProfiles(CB, ICallProfDataRef.slice(NumPromoted), TotalCount, - NumCandidates); + updateFuncValueProfiles(CB, ICallProfDataRef, TotalCount, NumCandidates); updateVPtrValueProfiles(VPtr, VTableGUIDCounts); return true; } void IndirectCallPromoter::updateFuncValueProfiles( - CallBase &CB, ArrayRef<InstrProfValueData> CallVDs, uint64_t TotalCount, - uint32_t MaxMDCount) { + CallBase &CB, MutableArrayRef<InstrProfValueData> CallVDs, + uint64_t TotalCount, uint32_t MaxMDCount) { // First clear the existing !prof. CB.setMetadata(LLVMContext::MD_prof, nullptr); + + // Sort value profiles by count in descending order. + llvm::stable_sort(CallVDs, [](const InstrProfValueData &LHS, + const InstrProfValueData &RHS) { + return LHS.Count > RHS.Count; + }); + // Drop the <target-value, count> pair if count is zero. + ArrayRef<InstrProfValueData> VDs( + CallVDs.begin(), + llvm::upper_bound(CallVDs, 0U, + [](uint64_t Count, const InstrProfValueData &ProfData) { + return ProfData.Count <= Count; + })); + // Annotate the remaining value profiles if counter is not zero. if (TotalCount != 0) - annotateValueSite(M, CB, CallVDs, TotalCount, IPVK_IndirectCallTarget, + annotateValueSite(M, CB, VDs, TotalCount, IPVK_IndirectCallTarget, MaxMDCount); } @@ -726,7 +788,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp( uint64_t TotalFuncCount, uint32_t NumCandidates, MutableArrayRef<InstrProfValueData> ICallProfDataRef, VTableGUIDCountsMap &VTableGUIDCounts) { - SmallVector<uint64_t, 4> PromotedFuncCount; + SmallVector<std::pair<uint32_t, uint64_t>, 4> PromotedFuncCount; for (const auto &Candidate : Candidates) { for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts) @@ -771,7 +833,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp( return Remark; }); - PromotedFuncCount.push_back(Candidate.Count); + PromotedFuncCount.push_back({Candidate.Index, Candidate.Count}); assert(TotalFuncCount >= Candidate.Count && "Within one prof metadata, total count is the sum of counts from " @@ -792,22 +854,12 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp( // used to load multiple virtual functions. The vtable profiles needs to be // updated properly in that case (e.g, for each indirect call annotate both // type profiles and function profiles in one !prof). - for (size_t I = 0; I < PromotedFuncCount.size(); I++) - ICallProfDataRef[I].Count -= - std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count); - // Sort value profiles by count in descending order. 
- llvm::stable_sort(ICallProfDataRef, [](const InstrProfValueData &LHS, - const InstrProfValueData &RHS) { - return LHS.Count > RHS.Count; - }); - // Drop the <target-value, count> pair if count is zero. - ArrayRef<InstrProfValueData> VDs( - ICallProfDataRef.begin(), - llvm::upper_bound(ICallProfDataRef, 0U, - [](uint64_t Count, const InstrProfValueData &ProfData) { - return ProfData.Count <= Count; - })); - updateFuncValueProfiles(CB, VDs, TotalFuncCount, NumCandidates); + for (size_t I = 0; I < PromotedFuncCount.size(); I++) { + uint32_t Index = PromotedFuncCount[I].first; + ICallProfDataRef[Index].Count -= + std::max(PromotedFuncCount[I].second, ICallProfDataRef[Index].Count); + } + updateFuncValueProfiles(CB, ICallProfDataRef, TotalFuncCount, NumCandidates); updateVPtrValueProfiles(VPtr, VTableGUIDCounts); return true; } @@ -822,9 +874,22 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) { uint64_t TotalCount; auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction( CB, TotalCount, NumCandidates); - if (!NumCandidates || - (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount))) + if (!NumCandidates) continue; + if (PSI && PSI->hasProfileSummary()) { + // Don't promote cold candidates. + if (PSI->isColdCount(TotalCount)) { + LLVM_DEBUG(dbgs() << "Don't promote the cold candidate: TotalCount=" + << TotalCount << "\n"); + continue; + } + // Only promote hot candidates if ICPAllowHotOnly is true. + if (ICPAllowHotOnly && !PSI->isHotCount(TotalCount)) { + LLVM_DEBUG(dbgs() << "Don't promote the non-hot candidate: TotalCount=" + << TotalCount << "\n"); + continue; + } + } auto PromotionCandidates = getPromotionCandidatesForCallSite( *CB, ICallProfDataRef, TotalCount, NumCandidates); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 599b60d..df31f07 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -158,7 +158,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/bit.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index 3fa844e..6135c7b 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -46,6 +46,8 @@ enum class ARCRuntimeEntryPointKind { UnsafeClaimRV, RetainAutorelease, RetainAutoreleaseRV, + AutoreleasePoolPush, + AutoreleasePoolPop, }; /// Declarations for ObjC runtime functions and constants.
These are initialized @@ -67,6 +69,8 @@ public: UnsafeClaimRV = nullptr; RetainAutorelease = nullptr; RetainAutoreleaseRV = nullptr; + AutoreleasePoolPush = nullptr; + AutoreleasePoolPop = nullptr; } Function *get(ARCRuntimeEntryPointKind kind) { @@ -101,6 +105,12 @@ public: case ARCRuntimeEntryPointKind::RetainAutoreleaseRV: return getIntrinsicEntryPoint(RetainAutoreleaseRV, Intrinsic::objc_retainAutoreleaseReturnValue); + case ARCRuntimeEntryPointKind::AutoreleasePoolPush: + return getIntrinsicEntryPoint(AutoreleasePoolPush, + Intrinsic::objc_autoreleasePoolPush); + case ARCRuntimeEntryPointKind::AutoreleasePoolPop: + return getIntrinsicEntryPoint(AutoreleasePoolPop, + Intrinsic::objc_autoreleasePoolPop); } llvm_unreachable("Switch should be a covered switch."); @@ -143,6 +153,12 @@ private: /// Declaration for objc_retainAutoreleaseReturnValue(). Function *RetainAutoreleaseRV = nullptr; + /// Declaration for objc_autoreleasePoolPush(). + Function *AutoreleasePoolPush = nullptr; + + /// Declaration for objc_autoreleasePoolPop(). + Function *AutoreleasePoolPop = nullptr; + Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) { if (Decl) return Decl; diff --git a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt index 80867db..4274667 100644 --- a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt +++ b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt @@ -2,7 +2,6 @@ add_llvm_component_library(LLVMObjCARCOpts ObjCARC.cpp ObjCARCOpts.cpp ObjCARCExpand.cpp - ObjCARCAPElim.cpp ObjCARCContract.cpp DependencyAnalysis.cpp ProvenanceAnalysis.cpp diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp deleted file mode 100644 index dceb2eb..0000000 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// -/// This file defines ObjC ARC optimizations. ARC stands for Automatic -/// Reference Counting and is a system for managing reference counts for objects -/// in Objective C. -/// -/// This specific file implements optimizations which remove extraneous -/// autorelease pools. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/ObjCARCAnalysisUtils.h" -#include "llvm/Analysis/ObjCARCInstKind.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/ObjCARC.h" - -using namespace llvm; -using namespace llvm::objcarc; - -#define DEBUG_TYPE "objc-arc-ap-elim" - -namespace { - -/// Interprocedurally determine if calls made by the given call site can -/// possibly produce autoreleases. 
-bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) { - if (const Function *Callee = CB.getCalledFunction()) { - if (!Callee->hasExactDefinition()) - return true; - for (const BasicBlock &BB : *Callee) { - for (const Instruction &I : BB) - if (const CallBase *JCB = dyn_cast<CallBase>(&I)) - // This recursion depth limit is arbitrary. It's just great - // enough to cover known interesting testcases. - if (Depth < 3 && !JCB->onlyReadsMemory() && - MayAutorelease(*JCB, Depth + 1)) - return true; - } - return false; - } - - return true; -} - -bool OptimizeBB(BasicBlock *BB) { - bool Changed = false; - - Instruction *Push = nullptr; - for (Instruction &Inst : llvm::make_early_inc_range(*BB)) { - switch (GetBasicARCInstKind(&Inst)) { - case ARCInstKind::AutoreleasepoolPush: - Push = &Inst; - break; - case ARCInstKind::AutoreleasepoolPop: - // If this pop matches a push and nothing in between can autorelease, - // zap the pair. - if (Push && cast<CallInst>(&Inst)->getArgOperand(0) == Push) { - Changed = true; - LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop " - "autorelease pair:\n" - " Pop: " - << Inst << "\n" - << " Push: " << *Push - << "\n"); - Inst.eraseFromParent(); - Push->eraseFromParent(); - } - Push = nullptr; - break; - case ARCInstKind::CallOrUser: - if (MayAutorelease(cast<CallBase>(Inst))) - Push = nullptr; - break; - default: - break; - } - } - - return Changed; -} - -bool runImpl(Module &M) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - if (!ModuleHasARC(M)) - return false; - // Find the llvm.global_ctors variable, as the first step in - // identifying the global constructors. In theory, unnecessary autorelease - // pools could occur anywhere, but in practice it's pretty rare. Global - // ctors are a place where autorelease pools get inserted automatically, - // so it's pretty common for them to be unnecessary, and it's pretty - // profitable to eliminate them. - GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); - if (!GV) - return false; - - assert(GV->hasDefinitiveInitializer() && - "llvm.global_ctors is uncooperative!"); - - bool Changed = false; - - // Dig the constructor functions out of GV's initializer. - ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); - for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); - OI != OE; ++OI) { - Value *Op = *OI; - // llvm.global_ctors is an array of three-field structs where the second - // members are constructor functions. - Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); - // If the user used a constructor function with the wrong signature and - // it got bitcasted or whatever, look the other way. - if (!F) - continue; - // Only look at function definitions. - if (F->isDeclaration()) - continue; - // Only look at functions with one basic block. - if (std::next(F->begin()) != F->end()) - continue; - // Ok, a single-block constructor function definition. Try to optimize it. 
- Changed |= OptimizeBB(&F->front()); - } - - return Changed; -} - -} // namespace - -PreservedAnalyses ObjCARCAPElimPass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runImpl(M)) - return PreservedAnalyses::all(); - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; -} diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 5eb3f51..66a2c76 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -39,6 +39,7 @@ #include "llvm/Analysis/ObjCARCAnalysisUtils.h" #include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/ObjCARCUtil.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -132,11 +133,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { // // The second retain and autorelease can be deleted. -// TODO: It should be possible to delete -// objc_autoreleasePoolPush and objc_autoreleasePoolPop -// pairs if nothing is actually autoreleased between them. Also, autorelease -// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code -// after inlining) can be turned into plain release calls. +// TODO: Autorelease calls followed by objc_autoreleasePoolPop calls (perhaps in +// ObjC++ code after inlining) can be turned into plain release calls. // TODO: Critical-edge splitting. If the optimial insertion point is // a critical edge, the current algorithm has to fail, because it doesn't @@ -566,6 +564,8 @@ class ObjCARCOpt { void OptimizeReturns(Function &F); + void OptimizeAutoreleasePools(Function &F); + template <typename PredicateT> static void cloneOpBundlesIf(CallBase *CI, SmallVectorImpl<OperandBundleDef> &OpBundles, @@ -2473,6 +2473,11 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) { (1 << unsigned(ARCInstKind::AutoreleaseRV)))) OptimizeReturns(F); + // Optimizations for autorelease pools. + if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::AutoreleasepoolPush)) | + (1 << unsigned(ARCInstKind::AutoreleasepoolPop)))) + OptimizeAutoreleasePools(F); + // Gather statistics after optimization. #ifndef NDEBUG if (AreStatisticsEnabled()) { @@ -2485,6 +2490,183 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) { return Changed; } +/// Interprocedurally determine if calls made by the given call site can +/// possibly produce autoreleases. +bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) { + if (CB.onlyReadsMemory()) + return false; + + // This recursion depth limit is arbitrary. It's just great + // enough to cover known interesting testcases. 
+ if (Depth > 5) + return true; + + if (const Function *Callee = CB.getCalledFunction()) { + if (!Callee->hasExactDefinition()) + return true; + for (const BasicBlock &BB : *Callee) { + for (const Instruction &I : BB) { + // TODO: Ignore all instructions between autorelease pools + ARCInstKind InstKind = GetBasicARCInstKind(&I); + switch (InstKind) { + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeak: + // These may produce autoreleases + return true; + + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::UnsafeClaimRV: + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::NoopCast: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::AutoreleasepoolPush: + case ARCInstKind::AutoreleasepoolPop: + // These ObjC runtime functions don't produce autoreleases + break; + + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + // For non-ObjC function calls, recursively analyze + if (MayAutorelease(cast<CallBase>(I), Depth + 1)) + return true; + break; + + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: + case ARCInstKind::None: + // These are not relevant for autorelease analysis + break; + } + } + } + return false; + } + + return true; +} + +/// Optimize autorelease pools by eliminating empty push/pop pairs. +void ObjCARCOpt::OptimizeAutoreleasePools(Function &F) { + LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeAutoreleasePools ==\n"); + + OptimizationRemarkEmitter ORE(&F); + + // Process each basic block independently. + // TODO: Can we optimize inter-block autorelease pool pairs? + // This would involve tracking autorelease pool state across blocks. 
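+  // Illustrative sketch (assuming the intrinsic form of the runtime calls) of
+  // the pattern eliminated here:
+  //   %pool = call ptr @llvm.objc.autoreleasePoolPush()
+  //   ... nothing in between that may autorelease ...
+  //   call void @llvm.objc.autoreleasePoolPop(ptr %pool)
+  // Such an empty push/pop pair is deleted, and any remaining uses of the push
+  // are replaced with poison.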
+ for (BasicBlock &BB : F) { + // Use a stack to track nested autorelease pools + SmallVector<std::pair<CallInst *, bool>, 4> + PoolStack; // {push_inst, has_autorelease_in_scope} + + for (Instruction &Inst : llvm::make_early_inc_range(BB)) { + ARCInstKind Class = GetBasicARCInstKind(&Inst); + + switch (Class) { + case ARCInstKind::AutoreleasepoolPush: { + // Start tracking a new autorelease pool scope + auto *Push = cast<CallInst>(&Inst); + PoolStack.push_back( + {Push, false}); // {push_inst, has_autorelease_in_scope} + LLVM_DEBUG(dbgs() << "Found autorelease pool push: " << *Push << "\n"); + break; + } + + case ARCInstKind::AutoreleasepoolPop: { + auto *Pop = cast<CallInst>(&Inst); + + if (PoolStack.empty()) + break; + + auto &TopPool = PoolStack.back(); + CallInst *PendingPush = TopPool.first; + bool HasAutoreleaseInScope = TopPool.second; + + // Pop the stack - remove this pool scope + PoolStack.pop_back(); + + // Bail if this pop doesn't match the pending push + if (Pop->getArgOperand(0)->stripPointerCasts() != PendingPush) + break; + + // Bail if there were autoreleases in this scope + if (HasAutoreleaseInScope) + break; + + // Optimize: eliminate this empty autorelease pool pair + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "AutoreleasePoolElimination", + PendingPush) + << "eliminated empty autorelease pool pair"; + }); + + // Replace all uses of push with poison before deletion + PendingPush->replaceAllUsesWith( + PoisonValue::get(PendingPush->getType())); + + Pop->eraseFromParent(); + PendingPush->eraseFromParent(); + + Changed = true; + ++NumNoops; + break; + } + case ARCInstKind::CallOrUser: + case ARCInstKind::Call: + if (!MayAutorelease(cast<CallBase>(Inst))) + break; + LLVM_FALLTHROUGH; + case ARCInstKind::Autorelease: + case ARCInstKind::AutoreleaseRV: + case ARCInstKind::FusedRetainAutorelease: + case ARCInstKind::FusedRetainAutoreleaseRV: + case ARCInstKind::LoadWeak: { + // Track that we have autorelease calls in the current pool scope + if (!PoolStack.empty()) { + PoolStack.back().second = true; // Set has_autorelease_in_scope = true + LLVM_DEBUG( + dbgs() + << "Found autorelease or potential autorelease in pool scope: " + << Inst << "\n"); + } + break; + } + + // Enumerate all remaining ARCInstKind cases explicitly + case ARCInstKind::Retain: + case ARCInstKind::RetainRV: + case ARCInstKind::UnsafeClaimRV: + case ARCInstKind::RetainBlock: + case ARCInstKind::Release: + case ARCInstKind::NoopCast: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::StoreWeak: + case ARCInstKind::InitWeak: + case ARCInstKind::MoveWeak: + case ARCInstKind::CopyWeak: + case ARCInstKind::DestroyWeak: + case ARCInstKind::StoreStrong: + case ARCInstKind::IntrinsicUser: + case ARCInstKind::User: + case ARCInstKind::None: + // These instruction kinds don't affect autorelease pool optimization + break; + } + } + } +} + /// @} /// diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 8c84b0d..03b92d3 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -88,7 +88,6 @@ #include <cassert> #include <cstdint> #include <utility> -#include <vector> using namespace llvm; using namespace SCEVPatternMatch; diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 70e9eee..08446cc 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ 
-17,8 +17,8 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -70,6 +70,13 @@ namespace { using LoopVector = SmallVector<Loop *, 8>; +/// A list of direction vectors. Each entry represents a direction vector +/// corresponding to one or more dependencies existing in the loop nest. The +/// length of all direction vectors is equal and is N + 1, where N is the depth +/// of the loop nest. The first N elements correspond to the dependency +/// direction of each of the N loops. The last one indicates whether this entry +/// is a forward dependency ('<') or not ('*'). The term "forward" aligns with +/// what is defined in LoopAccessAnalysis. // TODO: Check if we can use a sparse matrix here. using CharMatrix = std::vector<std::vector<char>>; @@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) { static void printDepMatrix(CharMatrix &DepMatrix) { for (auto &Row : DepMatrix) { - for (auto D : Row) + // Drop the last element because it is a flag indicating whether this is + // a forward dependency or not, which doesn't affect the legality check. + for (char D : drop_end(Row)) LLVM_DEBUG(dbgs() << D << " "); LLVM_DEBUG(dbgs() << "\n"); } } + +/// Return true if \p Src appears before \p Dst in the same basic block. +/// Precondition: \p Src and \p Dst are distinct instructions within the same +/// basic block. +static bool inThisOrder(const Instruction *Src, const Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && Src != Dst && + "Expected Src and Dst to be different instructions in the same BB"); + + bool FoundSrc = false; + for (const Instruction &I : *(Src->getParent())) { + if (&I == Src) { + FoundSrc = true; + continue; + } + if (&I == Dst) + return FoundSrc; + } + + llvm_unreachable("Dst not found"); +} #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, @@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; } ValueVector::iterator I, IE, J, JE; - StringSet<> Seen; + + // Manage direction vectors that are already seen. Map each direction vector + // to an index of DepMatrix at which it is stored. + StringMap<unsigned> Seen; for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { for (J = I, JE = MemInstr.end(); J != JE; ++J) { @@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Dep.push_back('I'); } + // Test whether the dependency is forward or not. + bool IsKnownForward = true; + if (Src->getParent() != Dst->getParent()) { + // In general, when Src and Dst are in different BBs, the execution + // order of them within a single iteration is not guaranteed. Treat it + // conservatively as a not-forward dependency in this case. + IsKnownForward = false; + } else { + // Src and Dst are in the same BB. If they are different + // instructions, Src should appear before Dst in the BB as they are + // stored to MemInstr in that order. + assert((Src == Dst || inThisOrder(Src, Dst)) && + "Unexpected instructions"); + + // If the Dependence object is reversed (due to normalization), it + // represents the dependency from Dst to Src, meaning it is a backward + // dependency. Otherwise it should be a forward dependency.
+ bool IsReversed = D->getSrc() != Src; + if (IsReversed) + IsKnownForward = false; + } + + // Initialize the last element. Assume forward dependencies only; it + // will be updated later if there is any non-forward dependency. + Dep.push_back('<'); + + // The last element should express the "summary" among one or more + // direction vectors whose first N elements are the same (where N is + // the depth of the loop nest). Hence we exclude the last element from + // the Seen map. + auto [Ite, Inserted] = Seen.try_emplace( + StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size()); + // Make sure we only add unique entries to the dependency matrix. - if (Seen.insert(StringRef(Dep.data(), Dep.size())).second) + if (Inserted) DepMatrix.push_back(Dep); + + // If we cannot prove that this dependency is forward, change the last + // element of the corresponding entry. Since a `[... *]` dependency + // includes a `[... <]` dependency, we do not need to keep both and + // change the existing entry instead. + if (!IsKnownForward) + DepMatrix[Ite->second].back() = '*'; } } } @@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, continue; // Check if the direction vector is lexicographically positive (or zero) - // for both before/after exchanged. - if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) + // for both before/after exchanged. Ignore the last element because it + // doesn't affect the legality. + if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) return false; std::swap(Cur[InnerLoopId], Cur[OuterLoopId]); - if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false) + if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false) return false; } return true; @@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { for (const auto &Dep : DepMatrix) { char Dir = Dep[LoopId]; - if (Dir != 'I' && Dir != '=') - return false; + char DepType = Dep.back(); + assert((DepType == '<' || DepType == '*') && + "Unexpected element in dependency vector"); + + // There are no loop-carried dependencies. + if (Dir == '=' || Dir == 'I') + continue; + + // DepType being '<' means that this direction vector represents a forward + // dependency. In principle, a loop with '<' direction can be vectorized in + // this case. + if (Dir == '<' && DepType == '<') + continue; + + // We cannot prove that the loop is vectorizable. + return false; } return true; } std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // If the outer loop is not loop independent it is not profitable to move - // this to inner position, since doing so would not enable inner loop - // parallelism. + // If the outer loop cannot be vectorized, it is not profitable to move this + // to inner position. if (!canVectorize(DepMatrix, OuterLoopId)) return false; - // If inner loop has dependence and outer loop is loop independent then it is - // profitable to interchange to enable inner loop parallelism. + // If the inner loop cannot be vectorized but the outer loop can be, then it + // is profitable to interchange to enable inner loop parallelism. 
if (!canVectorize(DepMatrix, InnerLoopId)) return true; diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 9e318b0..e3ef9d8 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3785,7 +3785,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { // Ignore icmp instructions which are already being analyzed. if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { unsigned OtherIdx = !U.getOperandNo(); - Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); + Value *OtherOp = ICI->getOperand(OtherIdx); if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) continue; } diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 84d1c0b..9220abb 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1593,11 +1593,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, // since both llvm.lifetime.start and llvm.lifetime.end intrinsics // practically fill all the bytes of the alloca with an undefined // value, although conceptually marked as alive/dead. - int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; - } + LifetimeMarkers.push_back(UI); + continue; } AAMetadataInstrs.insert(UI); @@ -1614,9 +1611,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, return true; }; - // Check that dest has no Mod/Ref, from the alloca to the Store, except full - // size lifetime intrinsics. And collect modref inst for the reachability - // check. + // Check that dest has no Mod/Ref, from the alloca to the Store. And collect + // modref inst for the reachability check. ModRefInfo DestModRef = ModRefInfo::NoModRef; MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size)); SmallVector<BasicBlock *, 8> ReachabilityWorklist; diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index ced61cb..aae5d60 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -458,8 +458,10 @@ bool ScalarizerVisitor::visit(Function &F) { Instruction *I = &*II; bool Done = InstVisitor::visit(I); ++II; - if (Done && I->getType()->isVoidTy()) + if (Done && I->getType()->isVoidTy()) { I->eraseFromParent(); + Scalarized = true; + } } } return finish(); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 9b40fc0..f6959ca2 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2144,9 +2144,23 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, bool CurrentLoopValid, bool PartiallyInvariant, bool InjectedCondition, ArrayRef<Loop *> NewLoops) { - // If we did a non-trivial unswitch, we have added new (cloned) loops. 
- if (!NewLoops.empty()) + auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag, + StringRef DisableTag) { + auto &Ctx = TargetLoop->getHeader()->getContext(); + MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag)); + MDNode *NewLoopID = makePostTransformationMetadata( + Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD}); + TargetLoop->setLoopID(NewLoopID); + }; + + // If we performed a non-trivial unswitch, we have added new cloned loops. + // Mark such newly-created loops as visited. + if (!NewLoops.empty()) { + for (Loop *NL : NewLoops) + RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial", + "llvm.loop.unswitch.nontrivial.disable"); U.addSiblingLoops(NewLoops); + } // If the current loop remains valid, we should revisit it to catch any // other unswitch opportunities. Otherwise, we need to mark it as deleted. @@ -2154,24 +2168,12 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, if (PartiallyInvariant) { // Mark the new loop as partially unswitched, to avoid unswitching on // the same condition again. - auto &Context = L.getHeader()->getContext(); - MDNode *DisableUnswitchMD = MDNode::get( - Context, - MDString::get(Context, "llvm.loop.unswitch.partial.disable")); - MDNode *NewLoopID = makePostTransformationMetadata( - Context, L.getLoopID(), {"llvm.loop.unswitch.partial"}, - {DisableUnswitchMD}); - L.setLoopID(NewLoopID); + RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial", + "llvm.loop.unswitch.partial.disable"); } else if (InjectedCondition) { // Do the same for injection of invariant conditions. - auto &Context = L.getHeader()->getContext(); - MDNode *DisableUnswitchMD = MDNode::get( - Context, - MDString::get(Context, "llvm.loop.unswitch.injection.disable")); - MDNode *NewLoopID = makePostTransformationMetadata( - Context, L.getLoopID(), {"llvm.loop.unswitch.injection"}, - {DisableUnswitchMD}); - L.setLoopID(NewLoopID); + RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection", + "llvm.loop.unswitch.injection.disable"); } else U.revisitCurrentLoop(); } else @@ -2809,9 +2811,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch -/// candidates available. Also accounting for the number of "sibling" loops with -/// the idea to account for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch +/// candidates available. Also consider the number of "sibling" loops with +/// the idea of accounting for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. 
Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -3507,8 +3509,9 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; IVConditionInfo PartialIVInfo; Instruction *PartialIVCondBranch = nullptr; - collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, - PartialIVCondBranch, L, LI, AA, MSSAU); + if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable")) + collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU); if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable")) collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, PartialIVCondBranch, L, DT, LI, AA, diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index a69d649..44e63a0 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -128,6 +129,7 @@ struct PredInfo { using BBPredicates = DenseMap<BasicBlock *, PredInfo>; using PredMap = DenseMap<BasicBlock *, BBPredicates>; using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>; +using Val2BBMap = DenseMap<Value *, BasicBlock *>; // A traits type that is intended to be used in graph algorithms. The graph // traits starts at an entry node, and traverses the RegionNodes that are in @@ -279,7 +281,7 @@ class StructurizeCFG { ConstantInt *BoolTrue; ConstantInt *BoolFalse; Value *BoolPoison; - + const TargetTransformInfo *TTI; Function *Func; Region *ParentRegion; @@ -301,8 +303,12 @@ class StructurizeCFG { PredMap LoopPreds; BranchVector LoopConds; + Val2BBMap HoistedValues; + RegionNode *PrevNode; + void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB); + void orderNodes(); void analyzeLoops(RegionNode *N); @@ -332,6 +338,8 @@ class StructurizeCFG { void simplifyAffectedPhis(); + void simplifyHoistedPhis(); + DebugLoc killTerminator(BasicBlock *BB); void changeExit(RegionNode *Node, BasicBlock *NewExit, @@ -359,7 +367,7 @@ class StructurizeCFG { public: void init(Region *R); - bool run(Region *R, DominatorTree *DT); + bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI); bool makeUniformRegion(Region *R, UniformityInfo &UA); }; @@ -385,8 +393,11 @@ public: if (SCFG.makeUniformRegion(R, UA)) return false; } + Function *F = R->getEntry()->getParent(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F); DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - return SCFG.run(R, DT); + return SCFG.run(R, DT, TTI); } StringRef getPassName() const override { return "Structurize control flow"; } @@ -394,7 +405,9 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { if (SkipUniformRegions) AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); RegionPass::getAnalysisUsage(AU); @@ -403,6 +416,34 @@ public: } // end anonymous namespace +/// Checks whether an instruction is 
a zero-cost instruction and checks if the +/// operands are from a different BB. If so, this instruction can be coalesced +/// if it is hoisted to the predecessor block, so this returns true. +static bool isHoistableInstruction(Instruction *I, BasicBlock *BB, + const TargetTransformInfo *TTI) { + if (I->getParent() != BB || isa<PHINode>(I)) + return false; + + // If the instruction is not a zero-cost instruction, return false. + auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); + InstructionCost::CostType CostVal = + Cost.isValid() + ? Cost.getValue() + : (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive; + if (CostVal != 0) + return false; + + // Check if any operands are instructions defined in the same block. + for (auto &Op : I->operands()) { + if (auto *OpI = dyn_cast<Instruction>(Op)) { + if (OpI->getParent() == BB) + return false; + } + } + + return true; +} + char StructurizeCFGLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg", @@ -413,6 +454,39 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg", "Structurize the CFG", false, false) +/// Structurization can introduce unnecessary VGPR copies due to register +/// coalescing interference. For example, if the Else block has a zero-cost +/// instruction and the Then block modifies the VGPR value, only one value is +/// live at a time in the merge block before structurization. After structurization, +/// the coalescer may incorrectly treat the Then value as live in the Else block +/// (via the path Then → Flow → Else), leading to unnecessary VGPR copies. +/// +/// This function examines phi nodes whose incoming values are zero-cost +/// instructions in the Else block. It identifies such values that can be safely +/// hoisted and moves them to the nearest common dominator of the Then and Else +/// blocks. A follow-up function after setting PhiNodes assigns the hoisted +/// value to poison phi nodes along the if→flow edge, aiding register coalescing +/// and minimizing unnecessary live ranges. +void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, + BasicBlock *ThenBB) { + + BasicBlock *ElseSucc = ElseBB->getSingleSuccessor(); + BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB); + + if (!ElseSucc || !CommonDominator) + return; + Instruction *Term = CommonDominator->getTerminator(); + for (PHINode &Phi : ElseSucc->phis()) { + Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB); + auto *Inst = dyn_cast<Instruction>(ElseVal); + if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI)) + continue; + Inst->removeFromParent(); + Inst->insertInto(CommonDominator, Term->getIterator()); + HoistedValues[Inst] = CommonDominator; + } +} + /// Build up the general order of nodes, by performing a topological sort of the /// parent region's nodes, while ensuring that there is no outer cycle node /// between any two inner cycle nodes.
@@ -535,7 +609,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { BasicBlock *Other = Term->getSuccessor(!i); if (Visited.count(Other) && !Loops.count(Other) && !Pred.count(Other) && !Pred.count(P)) { - + hoistZeroCostElseBlockPhiValues(Succ, Other); Pred[Other] = {BoolFalse, std::nullopt}; Pred[P] = {BoolTrue, std::nullopt}; continue; @@ -891,6 +965,44 @@ void StructurizeCFG::setPhiValues() { AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); } +/// Updates PHI nodes after hoisted zero cost instructions by replacing poison +/// entries on Flow nodes with the appropriate hoisted values +void StructurizeCFG::simplifyHoistedPhis() { + for (WeakVH VH : AffectedPhis) { + PHINode *Phi = dyn_cast_or_null<PHINode>(VH); + if (!Phi || Phi->getNumIncomingValues() != 2) + continue; + + for (int i = 0; i < 2; i++) { + Value *V = Phi->getIncomingValue(i); + auto BBIt = HoistedValues.find(V); + + if (BBIt == HoistedValues.end()) + continue; + + Value *OtherV = Phi->getIncomingValue(!i); + PHINode *OtherPhi = dyn_cast<PHINode>(OtherV); + if (!OtherPhi) + continue; + + int PoisonValBBIdx = -1; + for (size_t i = 0; i < OtherPhi->getNumIncomingValues(); i++) { + if (!isa<PoisonValue>(OtherPhi->getIncomingValue(i))) + continue; + PoisonValBBIdx = i; + break; + } + if (PoisonValBBIdx == -1 || + !DT->dominates(BBIt->second, + OtherPhi->getIncomingBlock(PoisonValBBIdx))) + continue; + + OtherPhi->setIncomingValue(PoisonValBBIdx, V); + Phi->setIncomingValue(i, OtherV); + } + } +} + void StructurizeCFG::simplifyAffectedPhis() { bool Changed; do { @@ -1283,12 +1395,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { } /// Run the transformation for each region found -bool StructurizeCFG::run(Region *R, DominatorTree *DT) { +bool StructurizeCFG::run(Region *R, DominatorTree *DT, + const TargetTransformInfo *TTI) { if (R->isTopLevelRegion()) return false; this->DT = DT; - + this->TTI = TTI; Func = R->getEntry()->getParent(); assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); @@ -1300,6 +1413,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) { insertConditions(false); insertConditions(true); setPhiValues(); + simplifyHoistedPhis(); simplifyConditions(); simplifyAffectedPhis(); rebuildSSA(); @@ -1349,7 +1463,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F, bool Changed = false; DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); auto &RI = AM.getResult<RegionInfoAnalysis>(F); - + TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F); UniformityInfo *UI = nullptr; if (SkipUniformRegions) UI = &AM.getResult<UniformityInfoAnalysis>(F); @@ -1368,7 +1482,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F, continue; } - Changed |= SCFG.run(R, DT); + Changed |= SCFG.run(R, DT, TTI); } if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index f7e66ec..a4fa0e2 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -68,6 +68,7 @@ add_llvm_component_library(LLVMTransformUtils MoveAutoInit.cpp NameAnonGlobals.cpp PredicateInfo.cpp + ProfileVerify.cpp PromoteMemoryToRegister.cpp RelLookupTableConverter.cpp ScalarEvolutionExpander.cpp diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 4210ce6..291e2a5 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -22,7 +22,6 @@ #include 
"llvm/IR/DebugLoc.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassInstrumentation.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index 8d18c75..a9e08ad 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -41,7 +41,6 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index f89d36f..babd7f6 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -482,16 +482,11 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, if (II->isLifetimeStartOrEnd()) { auto *Arg = II->getArgOperand(1); - // Lifetime intrinsics are dead when their right-hand is undef. - if (isa<UndefValue>(Arg)) - return true; - // If the right-hand is an alloc, global, or argument and the only uses - // are lifetime intrinsics then the intrinsics are dead. - if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg)) - return llvm::all_of(Arg->uses(), [](Use &Use) { - return isa<LifetimeIntrinsic>(Use.getUser()); - }); - return false; + // If the only uses of the alloca are lifetime intrinsics, then the + // intrinsics are dead. + return llvm::all_of(Arg->uses(), [](Use &Use) { + return isa<LifetimeIntrinsic>(Use.getUser()); + }); } // Assumptions are dead if their condition is trivially true. diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index ac413c9..de9deab 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -12,7 +12,6 @@ #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AssumptionCache.h" diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp new file mode 100644 index 0000000..b972132 --- /dev/null +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -0,0 +1,129 @@ +//===- ProfileVerify.cpp - Verify profile info for testing ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/ProfileVerify.h" +#include "llvm/ADT/DynamicAPInt.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/ProfDataUtils.h" +#include "llvm/Support/BranchProbability.h" + +using namespace llvm; +namespace { +class ProfileInjector { + Function &F; + FunctionAnalysisManager &FAM; + +public: + static const Instruction * + getTerminatorBenefitingFromMDProf(const BasicBlock &BB) { + if (succ_size(&BB) < 2) + return nullptr; + auto *Term = BB.getTerminator(); + return (isa<BranchInst>(Term) || isa<SwitchInst>(Term) || + isa<IndirectBrInst>(Term) || isa<CallBrInst>(Term)) + ? Term + : nullptr; + } + + static Instruction *getTerminatorBenefitingFromMDProf(BasicBlock &BB) { + return const_cast<Instruction *>( + getTerminatorBenefitingFromMDProf(const_cast<const BasicBlock &>(BB))); + } + + ProfileInjector(Function &F, FunctionAnalysisManager &FAM) : F(F), FAM(FAM) {} + bool inject(); +}; +} // namespace + +// FIXME: currently this injects only for terminators. Select isn't yet +// supported. +bool ProfileInjector::inject() { + // Get whatever branch probability info can be derived from the given IR - + // whether it has or not metadata. The main intention for this pass is to + // ensure that other passes don't drop or "forget" to update MD_prof. We do + // this as a mode in which lit tests would run. We want to avoid changing the + // behavior of those tests. A pass may use BPI (or BFI, which is computed from + // BPI). If no metadata is present, BPI is guesstimated by + // BranchProbabilityAnalysis. The injector (this pass) only persists whatever + // information the analysis provides, in other words, the pass being tested + // will get the same BPI it does if the injector wasn't running. 
+ auto &BPI = FAM.getResult<BranchProbabilityAnalysis>(F); + + bool Changed = false; + for (auto &BB : F) { + auto *Term = getTerminatorBenefitingFromMDProf(BB); + if (!Term || Term->getMetadata(LLVMContext::MD_prof)) + continue; + SmallVector<BranchProbability> Probs; + Probs.reserve(Term->getNumSuccessors()); + for (auto I = 0U, E = Term->getNumSuccessors(); I < E; ++I) + Probs.emplace_back(BPI.getEdgeProbability(&BB, Term->getSuccessor(I))); + + assert(llvm::find_if(Probs, + [](const BranchProbability &P) { + return P.isUnknown(); + }) == Probs.end() && + "All branch probabilities should be valid"); + const auto *FirstZeroDenominator = + find_if(Probs, [](const BranchProbability &P) { + return P.getDenominator() == 0; + }); + (void)FirstZeroDenominator; + assert(FirstZeroDenominator == Probs.end()); + const auto *FirstNonZeroNumerator = + find_if(Probs, [](const BranchProbability &P) { return !P.isZero(); }); + assert(FirstNonZeroNumerator != Probs.end()); + DynamicAPInt LCM(Probs[0].getDenominator()); + DynamicAPInt GCD(FirstNonZeroNumerator->getNumerator()); + for (const auto &Prob : drop_begin(Probs)) { + if (!Prob.getNumerator()) + continue; + LCM = llvm::lcm(LCM, DynamicAPInt(Prob.getDenominator())); + GCD = llvm::gcd(GCD, DynamicAPInt(Prob.getNumerator())); + } + SmallVector<uint32_t> Weights; + Weights.reserve(Term->getNumSuccessors()); + for (const auto &Prob : Probs) { + DynamicAPInt W = + (Prob.getNumerator() * LCM / GCD) / Prob.getDenominator(); + Weights.emplace_back(static_cast<uint32_t>((int64_t)W)); + } + setBranchWeights(*Term, Weights, /*IsExpected=*/false); + Changed = true; + } + return Changed; +} + +PreservedAnalyses ProfileInjectorPass::run(Function &F, + FunctionAnalysisManager &FAM) { + ProfileInjector PI(F, FAM); + if (!PI.inject()) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +PreservedAnalyses ProfileVerifierPass::run(Function &F, + FunctionAnalysisManager &FAM) { + for (const auto &BB : F) + if (const auto *Term = + ProfileInjector::getTerminatorBenefitingFromMDProf(BB)) + if (!Term->getMetadata(LLVMContext::MD_prof)) + F.getContext().emitError("Profile verification failed"); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index ed08c0b..ddb062b 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" @@ -42,6 +43,7 @@ cl::opt<unsigned> llvm::SCEVCheapExpansionBudget( "controls the budget that is considered cheap (default = 4)")); using namespace PatternMatch; +using namespace SCEVPatternMatch; PoisonFlags::PoisonFlags(const Instruction *I) { NUW = false; @@ -1224,6 +1226,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { } Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { + Type *STy = S->getType(); const Loop *L = S->getLoop(); BasicBlock *EB = L->getExitBlock(); if (!EB || !EB->getSinglePredecessor() || @@ -1231,11 +1234,36 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { return nullptr; for (auto &PN : EB->phis()) { - if (!SE.isSCEVable(PN.getType()) || PN.getType() != 
S->getType()) + if (!SE.isSCEVable(PN.getType())) continue; - auto *ExitV = SE.getSCEV(&PN); - if (S == ExitV) - return &PN; + auto *ExitSCEV = SE.getSCEV(&PN); + if (!isa<SCEVAddRecExpr>(ExitSCEV)) + continue; + Type *PhiTy = PN.getType(); + if (STy->isIntegerTy() && PhiTy->isPointerTy()) + ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy); + else if (S->getType() != PN.getType()) + continue; + + // Check if we can re-use the existing PN, by adjusting it with an expanded + // offset, if the offset is simpler. + const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV); + const SCEV *Op = Diff; + match(Diff, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op))); + match(Op, m_scev_PtrToInt(m_SCEV(Op))); + if (!isa<SCEVConstant, SCEVUnknown>(Op)) + continue; + + assert(Diff->getType()->isIntegerTy() && + "difference must be of integer type"); + Value *DiffV = expand(Diff); + Value *BaseV = &PN; + if (PhiTy->isPointerTy()) { + if (STy->isPointerTy()) + return Builder.CreatePtrAdd(BaseV, DiffV); + BaseV = Builder.CreatePtrToInt(BaseV, DiffV->getType()); + } + return Builder.CreateAdd(BaseV, DiffV); } return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99a96a8..a53ccdd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2021,6 +2021,9 @@ public: /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR /// outside VPlan. std::pair<Value *, BasicBlock *> getMemRuntimeChecks() { + using namespace llvm::PatternMatch; + if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt())) + return {nullptr, nullptr}; return {MemRuntimeCheckCond, MemCheckBlock}; } @@ -7276,6 +7279,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); + VPlanTransforms::removeBranchOnConst(BestVPlan); VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); @@ -10095,9 +10099,20 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) + if (VF.Width.isVector() || SelectedIC > 1) { Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + // Bail out early if either the SCEV or memory runtime checks are known to + // fail. In that case, the vector loop would never execute. + using namespace llvm::PatternMatch; + if (Checks.getSCEVChecks().first && + match(Checks.getSCEVChecks().first, m_One())) + return false; + if (Checks.getMemRuntimeChecks().first && + match(Checks.getMemRuntimeChecks().first, m_One())) + return false; + } + // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; @@ -10228,6 +10243,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. 
+ VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount, + BestPlan, VF.Width, IC, PSE); + LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { @@ -10295,6 +10315,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.MinProfitableTripCount, IC, &CM, BFI, PSI, Checks, BestPlan); + // TODO: Move to general VPlan pipeline once epilogue loops are also + // supported. + VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount, + BestPlan, VF.Width, IC, PSE); + LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0d0b342..593868f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -206,6 +206,12 @@ static cl::opt<bool> VectorizeNonPowerOf2( "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements.")); +/// Enables vectorization of copyable elements. +static cl::opt<bool> VectorizeCopyableElements( + "slp-copyable-elements", cl::init(true), cl::Hidden, + cl::desc("Try to replace values with the idempotent instructions for " + "better vectorization.")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -855,6 +861,13 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) { return *EI->idx_begin(); } +namespace llvm { +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V); +} // namespace llvm + namespace { /// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. @@ -957,6 +970,33 @@ class BinOpSameOpcodeHelper { return Instruction::Xor; llvm_unreachable("Cannot find interchangeable instruction."); } + + /// Return true if the instruction can be converted to \p Opcode. + bool hasCandidateOpcode(unsigned Opcode) const { + MaskType Candidate = Mask & SeenBefore; + switch (Opcode) { + case Instruction::Shl: + return Candidate & ShlBIT; + case Instruction::AShr: + return Candidate & AShrBIT; + case Instruction::Mul: + return Candidate & MulBIT; + case Instruction::Add: + return Candidate & AddBIT; + case Instruction::Sub: + return Candidate & SubBIT; + case Instruction::And: + return Candidate & AndBIT; + case Instruction::Or: + return Candidate & OrBIT; + case Instruction::Xor: + return Candidate & XorBIT; + default: + break; + } + llvm_unreachable("Cannot find interchangeable instruction."); + } + SmallVector<Value *> getOperand(const Instruction *To) const { unsigned ToOpcode = To->getOpcode(); unsigned FromOpcode = I->getOpcode(); @@ -1117,6 +1157,10 @@ public: AltOp.trySet(OpcodeInMaskForm, InterchangeableMask)); } unsigned getMainOpcode() const { return MainOp.getOpcode(); } + /// Checks if the list of potential opcodes includes \p Opcode. + bool hasCandidateOpcode(unsigned Opcode) const { + return MainOp.hasCandidateOpcode(Opcode); + } bool hasAltOp() const { return AltOp.I; } unsigned getAltOpcode() const { return hasAltOp() ? AltOp.getOpcode() : getMainOpcode(); @@ -1152,6 +1196,8 @@ class InstructionsState { /// GetVectorCost. 
Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; + /// Whether the instruction state represents copyable instructions. + bool HasCopyables = false; public: Instruction *getMainOp() const { @@ -1190,9 +1236,11 @@ public: if (!I->isBinaryOp()) return nullptr; BinOpSameOpcodeHelper Converter(MainOp); - if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp()) - return MainOp; - return AltOp; + if (!Converter.add(I) || !Converter.add(MainOp)) + return nullptr; + if (Converter.hasAltOp() && !isAltShuffle()) + return nullptr; + return Converter.hasAltOp() ? AltOp : MainOp; } /// Checks if main/alt instructions are shift operations. @@ -1237,9 +1285,63 @@ public: explicit operator bool() const { return valid(); } InstructionsState() = delete; - InstructionsState(Instruction *MainOp, Instruction *AltOp) - : MainOp(MainOp), AltOp(AltOp) {} + InstructionsState(Instruction *MainOp, Instruction *AltOp, + bool HasCopyables = false) + : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + + bool isCopyableElement(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + if (!HasCopyables) + return false; + if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr) + return false; + auto *I = dyn_cast<Instruction>(V); + if (!I) + return !isa<PoisonValue>(V); + if (I->getParent() != MainOp->getParent() && + (!isVectorLikeInstWithConstOps(I) || + !isVectorLikeInstWithConstOps(MainOp))) + return true; + if (I->getOpcode() == MainOp->getOpcode()) + return false; + if (!I->isBinaryOp()) + return true; + BinOpSameOpcodeHelper Converter(MainOp); + return !Converter.add(I) || !Converter.add(MainOp) || + Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode()); + } + + /// Checks if the value is non-schedulable. + bool isNonSchedulable(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + auto *I = dyn_cast<Instruction>(V); + if (!HasCopyables) + return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) || + doesNotNeedToBeScheduled(V); + // MainOp for copyables is always schedulable to correctly identify + // non-schedulable copyables. + if (isCopyableElement(V)) { + auto IsNonSchedulableCopyableElement = [this](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() || + (doesNotNeedToBeScheduled(I) && + // If the copyable instruction comes after MainOp + // (non-schedulable, but used in the block) - cannot vectorize + // it, as it will possibly generate a use before def.
+ (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I))); + }; + + return IsNonSchedulableCopyableElement(V); + } + return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) || + doesNotNeedToBeScheduled(V); + } + + bool areInstructionsWithCopyableElements() const { + assert(valid() && "InstructionsState is invalid."); + return HasCopyables; + } }; std::pair<Instruction *, SmallVector<Value *>> @@ -1917,6 +2019,7 @@ public: CompressEntryToData.clear(); ExternalUses.clear(); ExternalUsesAsOriginalScalar.clear(); + ExternalUsesWithNonUsers.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -2899,9 +3002,6 @@ public: for (OperandDataVec &Ops : OpsVec) Ops.resize(NumLanes); for (unsigned Lane : seq<unsigned>(NumLanes)) { - Value *V = VL[Lane]; - assert((isa<Instruction>(V) || isa<PoisonValue>(V)) && - "Expected instruction or poison value"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of V and whether the operand at OpIdx is the LHS or RHS @@ -2912,17 +3012,24 @@ public: // Since operand reordering is performed on groups of commutative // operations or alternating sequences (e.g., +, -), we can safely tell // the inverse operations by checking commutativity. - if (isa<PoisonValue>(V)) { + auto *I = dyn_cast<Instruction>(VL[Lane]); + if (!I && isa<PoisonValue>(VL[Lane])) { for (unsigned OpIdx : seq<unsigned>(NumOperands)) OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false}; continue; } - auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S); - // We cannot check commutativity by the converted instruction - // (SelectedOp) because isCommutative also examines def-use - // relationships. - bool IsInverseOperation = - !isCommutative(SelectedOp, cast<Instruction>(V)); + bool IsInverseOperation = false; + if (S.isCopyableElement(VL[Lane])) { + // The value is a copyable element. + IsInverseOperation = !isCommutative(MainOp); + } else { + assert(I && "Expected instruction"); + auto [SelectedOp, Ops] = convertTo(I, S); + // We cannot check commutativity by the converted instruction + // (SelectedOp) because isCommutative also examines def-use + // relationships. + IsInverseOperation = !isCommutative(SelectedOp, I); + } for (unsigned OpIdx : seq<unsigned>(ArgSize)) { bool APO = (OpIdx == 0) ? false : IsInverseOperation; OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false}; @@ -3792,6 +3899,9 @@ private: /// reordering of operands during buildTreeRec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; + /// Copyable elements of the entry node. + SmallPtrSet<const Value *, 4> CopyableElements; + /// MainOp and AltOp are recorded inside. S should be obtained from /// newTreeEntry. InstructionsState S = InstructionsState::invalid(); @@ -3820,11 +3930,7 @@ private: void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } /// Marks the node as one that does not require scheduling. - void setDoesNotNeedToSchedule() { - assert(::doesNotNeedToSchedule(Scalars) && - "Expected to not need scheduling"); - DoesNotNeedToSchedule = true; - } + void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; } /// Returns true if the node is marked as one that does not require /// scheduling. bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; } @@ -3896,6 +4002,20 @@ private: bool hasState() const { return S.valid(); } + /// Add \p V to the list of copyable elements. 
+ void addCopyableElement(Value *V) { + assert(S.isCopyableElement(V) && "Not a copyable element."); + CopyableElements.insert(V); + } + + /// Returns true if \p V is a copyable element. + bool isCopyableElement(Value *V) const { + return CopyableElements.contains(V); + } + + /// Returns true if any scalar in the list is a copyable element. + bool hasCopyableElements() const { return !CopyableElements.empty(); } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. unsigned findLaneForValue(Value *V) const { @@ -3968,6 +4088,8 @@ private: for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "State: "; + if (S && hasCopyableElements()) + dbgs() << "[[Copyable]] "; switch (State) { case Vectorize: if (InterleaveFactor > 0) { @@ -4145,12 +4267,20 @@ private: } } } else if (!Last->isGather()) { - if (doesNotNeedToSchedule(VL)) + if (isa<PHINode>(S.getMainOp()) || + isVectorLikeInstWithConstOps(S.getMainOp()) || + (!S.areInstructionsWithCopyableElements() && + doesNotNeedToSchedule(VL)) || + all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) Last->setDoesNotNeedToSchedule(); SmallPtrSet<Value *, 4> Processed; for (Value *V : VL) { if (isa<PoisonValue>(V)) continue; + if (S.isCopyableElement(V)) { + Last->addCopyableElement(V); + continue; + } auto It = ScalarToTreeEntries.find(V); if (It == ScalarToTreeEntries.end()) { ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last); @@ -4162,16 +4292,14 @@ private: } } // Update the scheduler bundle to point to this TreeEntry. - assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) || - isVectorLikeInstWithConstOps(S.getMainOp()) || - Last->doesNotNeedToSchedule()) && + assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) && "Bundle and VL out of sync"); if (!Bundle.getBundle().empty()) { #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) auto *BundleMember = Bundle.getBundle().begin(); SmallPtrSet<Value *, 4> Processed; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second) + if (S.isNonSchedulable(V) || !Processed.insert(V).second) continue; ++BundleMember; } @@ -4280,7 +4408,8 @@ private: /// in general. ScalarsVectorizationLegality getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const; + const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const; /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. @@ -4420,6 +4549,10 @@ private: /// extractelement instructions. SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar; + /// A list of scalars to be extracted without a specific user because of too many + /// uses. + SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers; + /// Values used only by @llvm.assume calls. SmallPtrSet<const Value *, 32> EphValues; @@ -4996,7 +5129,8 @@ private: /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. - ScheduleBundle &buildBundle(ArrayRef<Value *> VL); + ScheduleBundle &buildBundle(ArrayRef<Value *> VL, + const InstructionsState &S); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies.
This is only a dry-run, no instructions are @@ -6727,7 +6861,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, return std::move(ResOrder); } if (TE.State == TreeEntry::StridedVectorize && !TopToBottom && - (!TE.UserTreeIndex || + (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() || !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) && (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices))) return std::nullopt; @@ -7038,10 +7172,11 @@ bool BoUpSLP::isProfitableToReorder() const { VectorizableTree.front()->getOpcode() == Instruction::ICmp))) && VectorizableTree.front()->ReorderIndices.empty()) { // Check if the tree has only single store and single (unordered) load node, - // other nodes are phis or geps/binops, combined with phis, and/orsingle + // other nodes are phis or geps/binops, combined with phis, and/or single // gather load node bool HasPhis = false; - if (VectorizableTree.front()->getOpcode() == Instruction::PHI && + if (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::PHI && VectorizableTree.front()->Scalars.size() == TinyVF && VectorizableTree.front()->getNumOperands() > PhiOpsLimit) return false; @@ -7049,6 +7184,8 @@ bool BoUpSLP::isProfitableToReorder() const { unsigned GatherLoads = 0; for (const std::unique_ptr<TreeEntry> &TE : ArrayRef(VectorizableTree).drop_front()) { + if (TE->State == TreeEntry::SplitVectorize) + continue; if (!TE->hasState()) { if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) || all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>)) @@ -7072,7 +7209,10 @@ bool BoUpSLP::isProfitableToReorder() const { if (TE->getOpcode() == Instruction::GetElementPtr || Instruction::isBinaryOp(TE->getOpcode())) continue; - if (TE->getOpcode() != Instruction::PHI) + if (TE->getOpcode() != Instruction::PHI && + (!TE->hasCopyableElements() || + static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) < + TE->Scalars.size() / 2)) return true; if (VectorizableTree.front()->Scalars.size() == TinyVF && TE->getNumOperands() > PhiOpsLimit) @@ -7860,7 +8000,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const { - if ((Entry.getOpcode() == Instruction::Store || + if (Entry.hasState() && + (Entry.getOpcode() == Instruction::Store || Entry.getOpcode() == Instruction::Load) && Entry.State == TreeEntry::StridedVectorize && !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices)) @@ -7870,7 +8011,9 @@ Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const { void BoUpSLP::buildExternalUses( const ExtraValueToDebugLocsMap &ExternallyUsedValues) { + const size_t NumVectScalars = ScalarToTreeEntries.size() + 1; DenseMap<Value *, unsigned> ScalarToExtUses; + SmallPtrSet<Value *, 4> ExternalUsers; // Collect the values that we need to extract from the tree. for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -7882,13 +8025,24 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - if (!isa<Instruction>(Scalar)) + if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar)) continue; + // All uses must be replaced already? No need to do it again. 
auto It = ScalarToExtUses.find(Scalar); if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User) continue; + if (Scalar->hasNUsesOrMore(NumVectScalars)) { + unsigned FoundLane = Entry->findLaneForValue(Scalar); + LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane + << " from " << *Scalar << " for many users.\n"); + It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first; + ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane); + ExternalUsesWithNonUsers.insert(Scalar); + continue; + } + // Check if the scalar is externally used as an extra arg. const auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { @@ -7916,7 +8070,10 @@ void BoUpSLP::buildExternalUses( // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in FoundLane will // be used. - if (all_of(UseEntries, [&](TreeEntry *UseEntry) { + if (!((Scalar->getType()->getScalarType()->isPointerTy() && + isa<LoadInst, StoreInst>(UserInst)) || + isa<CallInst>(UserInst)) || + all_of(UseEntries, [&](TreeEntry *UseEntry) { return UseEntry->State == TreeEntry::ScatterVectorize || !doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, @@ -7946,6 +8103,7 @@ void BoUpSLP::buildExternalUses( << ".\n"); It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first; ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane); + ExternalUsesWithNonUsers.insert(Scalar); if (!U) break; } @@ -9612,7 +9770,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, PoisonValue::get(UniqueValues.front()->getType())); // Check that extended with poisons operations are still valid for // vectorization (div/rem are not allowed). - if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) { + if (!S.areInstructionsWithCopyableElements() && + !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); ReuseShuffleIndices.clear(); return false; @@ -9761,13 +9920,95 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL, } namespace { -/// Class accepts incoming list of values and generates the list of values -/// for scheduling and list of operands for the new nodes. +/// Class accepts incoming list of values, checks if it is able to model +/// "copyable" values as compatible operations, and generates the list of values +/// for scheduling and list of operands for the new nodes. class InstructionsCompatibilityAnalysis { DominatorTree &DT; const DataLayout &DL; const TargetTransformInfo &TTI; const TargetLibraryInfo &TLI; + unsigned MainOpcode = 0; + Instruction *MainOp = nullptr; + + /// Identifies the best candidate value, which represents the main opcode + /// operation. + /// Currently the best candidate is the Add instruction whose parent + /// block has the highest DFS incoming number (the block that dominates the others). + void findAndSetMainInstruction(ArrayRef<Value *> VL) { + BasicBlock *Parent = nullptr; + // Checks if the instruction has a supported opcode.
+ auto IsSupportedOpcode = [](Instruction *I) { + return I && I->getOpcode() == Instruction::Add; + }; + SmallDenseSet<Value *, 8> Operands; + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); + if (!I) + continue; + if (!DT.isReachableFromEntry(I->getParent())) + continue; + if (!MainOp) { + MainOp = I; + Parent = I->getParent(); + Operands.insert(I->op_begin(), I->op_end()); + continue; + } + if (Parent == I->getParent()) { + if (!IsSupportedOpcode(MainOp)) + MainOp = I; + if (MainOp->getOpcode() == I->getOpcode() && + doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I)) + MainOp = I; + Operands.insert(I->op_begin(), I->op_end()); + continue; + } + auto *NodeA = DT.getNode(Parent); + auto *NodeB = DT.getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { + MainOp = I; + Parent = I->getParent(); + Operands.clear(); + Operands.insert(I->op_begin(), I->op_end()); + } + } + if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) { + MainOp = nullptr; + return; + } + MainOpcode = MainOp->getOpcode(); + } + + /// Returns the idempotent value for the \p MainOp with the detected \p + /// MainOpcode. For Add, returns 0. For Or, it should choose between false and + /// the operand itself, since V or V == V. + Value *selectBestIdempotentValue() const { + assert(MainOpcode == Instruction::Add && "Unsupported opcode"); + return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(), + !MainOp->isCommutative()); + } + + /// Returns the value and operands for \p V, considering whether it is an original + /// instruction, whose actual operands should be returned, or a + /// copyable element, which should be represented as an idempotent instruction. + SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const { + if (isa<PoisonValue>(V)) + return {V, V}; + if (!S.isCopyableElement(V)) + return convertTo(cast<Instruction>(V), S).second; + switch (MainOpcode) { + case Instruction::Add: + return {V, selectBestIdempotentValue()}; + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } /// Builds operands for the original instructions. void @@ -9928,22 +10169,165 @@ public: const TargetLibraryInfo &TLI) : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {} + InstructionsState + buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R, + bool TryCopyableElementsVectorization, + bool WithProfitabilityCheck = false, + bool SkipSameCodeCheck = false) { + InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL)) + ? InstructionsState::invalid() + : getSameOpcode(VL, TLI); + if (S) + return S; + if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) + return S; + findAndSetMainInstruction(VL); + if (!MainOp) + return InstructionsState::invalid(); + S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true); + // TODO: Remove this check once support for schedulable copyables is landed. + if (any_of(VL, [&](Value *V) { + return S.isCopyableElement(V) && !S.isNonSchedulable(V); + })) + return InstructionsState::invalid(); + + if (!WithProfitabilityCheck) + return S; + // Check if it is profitable to vectorize the instruction.
+ SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL); + auto BuildCandidates = + [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1, + Value *V2) { + if (V1 != V2 && isa<PHINode>(V1)) + return; + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (I1 && I2 && I1->getOpcode() == I2->getOpcode() && + I1->getParent() != I2->getParent()) + return; + Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1); + }; + if (VL.size() == 2) { + // Check if the operands allow better vectorization. + SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2; + BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]); + BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]); + bool Res = !Candidates1.empty() && !Candidates2.empty() && + R.findBestRootPair(Candidates1) && + R.findBestRootPair(Candidates2); + if (!Res && isCommutative(MainOp)) { + Candidates1.clear(); + Candidates2.clear(); + BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]); + BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]); + Res = !Candidates1.empty() && !Candidates2.empty() && + R.findBestRootPair(Candidates1) && + R.findBestRootPair(Candidates2); + } + if (!Res) + return InstructionsState::invalid(); + return S; + } + assert(Operands.size() == 2 && "Unexpected number of operands!"); + unsigned CopyableNum = + count_if(VL, [&](Value *V) { return S.isCopyableElement(V); }); + if (CopyableNum < VL.size() / 2) + return S; + // Too many phi copyables - exit. + const unsigned Limit = VL.size() / 24; + if ((CopyableNum >= VL.size() - Limit || + (CopyableNum >= VL.size() - 1 && VL.size() > 4) || + CopyableNum >= MaxPHINumOperands) && + all_of(VL, [&](Value *V) { + return isa<PHINode>(V) || !S.isCopyableElement(V); + })) + return InstructionsState::invalid(); + // Check profitability if number of copyables > VL.size() / 2. + // 1. Reorder operands for better matching. + if (isCommutative(MainOp)) { + for (auto &Ops : Operands) { + // Make instructions the first operands. + if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + // Make constants the second operands. + if (isa<Constant>(Ops.front())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + } + } + // 2. Check, if operands can be vectorized. + if (count_if(Operands.back(), IsaPred<Instruction>) > 1) + return InstructionsState::invalid(); + auto CheckOperand = [&](ArrayRef<Value *> Ops) { + if (allConstant(Ops) || isSplat(Ops)) + return true; + // Check if it is "almost" splat, i.e. has >= 4 elements and only single + // one is different. + constexpr unsigned Limit = 4; + if (Operands.front().size() >= Limit) { + SmallDenseMap<const Value *, unsigned> Counters; + for (Value *V : Ops) { + if (isa<UndefValue>(V)) + continue; + ++Counters[V]; + } + if (Counters.size() == 2 && + any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) { + return C.second == 1; + })) + return true; + } + // First operand not a constant or splat? Last attempt - check for + // potential vectorization. 
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + InstructionsState OpS = Analysis.buildInstructionsState( + Ops, R, /*TryCopyableElementsVectorization=*/true); + if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops))) + return false; + unsigned CopyableNum = + count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); }); + return CopyableNum <= VL.size() / 2; + }; + if (!CheckOperand(Operands.front())) + return InstructionsState::invalid(); + + return S; + } + SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S, ArrayRef<Value *> VL) { assert(S && "Invalid state!"); SmallVector<BoUpSLP::ValueList> Operands; - buildOriginalOperands(S, VL, Operands); + if (S.areInstructionsWithCopyableElements()) { + MainOp = S.getMainOp(); + MainOpcode = S.getOpcode(); + Operands.assign(MainOp->getNumOperands(), + BoUpSLP::ValueList(VL.size(), nullptr)); + for (auto [Idx, V] : enumerate(VL)) { + SmallVector<Value *> OperandsForValue = getOperands(S, V); + for (auto [OperandIdx, Operand] : enumerate(OperandsForValue)) + Operands[OperandIdx][Idx] = Operand; + } + } else { + buildOriginalOperands(S, VL, Operands); + } return Operands; } }; } // namespace -BoUpSLP::ScalarsVectorizationLegality -BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const { +BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( + ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL, *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + VL, *this, TryCopyableElementsVectorization, + /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization); // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no @@ -10066,7 +10450,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth, bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; - bool AreAllSameBlock = S && allSameBlock(VL); + bool AreAllSameBlock = S.valid(); bool AreScatterAllGEPSameBlock = (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && VL.size() > 2 && @@ -10091,12 +10475,18 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth, NotProfitableForVectorization(VL)) { if (!S) { LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to " - "C,S,B,O, small shuffle. \n"); + "C,S,B,O, small shuffle. \n"; + dbgs() << "["; + interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; }); + dbgs() << "]\n"); return ScalarsVectorizationLegality(S, /*IsLegal=*/false, /*TryToFindDuplicates=*/true, /*TrySplitVectorize=*/true); } - LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. 
\n"; + dbgs() << "["; + interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; }); + dbgs() << "]\n"); return ScalarsVectorizationLegality(S, /*IsLegal=*/false); } @@ -10242,9 +10632,29 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, return true; }; - ScalarsVectorizationLegality Legality = - getScalarsVectorizationLegality(VL, Depth, UserTreeIdx); - const InstructionsState &S = Legality.getInstructionsState(); + auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) { + bool AreConsts = false; + for (Value *V : VL) { + if (isa<PoisonValue>(V)) + continue; + if (isa<Constant>(V)) { + AreConsts = true; + continue; + } + if (!isa<PHINode>(V)) + return false; + } + return AreConsts; + }; + if (AreOnlyConstsWithPHIs(VL)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n"); + newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx); + return; + } + + ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false); + InstructionsState S = Legality.getInstructionsState(); if (!Legality.isLegal()) { if (Legality.trySplitVectorize()) { auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL); @@ -10252,11 +10662,18 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp))) return; } - if (Legality.tryToFindDuplicates()) - tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx); + if (!S) + Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true); + if (!Legality.isLegal()) { + if (Legality.tryToFindDuplicates()) + tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, + UserTreeIdx); - newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); - return; + newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); + return; + } + S = Legality.getInstructionsState(); } // FIXME: investigate if there are profitable cases for VL.size() <= 4. @@ -13024,7 +13441,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && - E->getMainOp()->getType()->isPointerTy())) && + E->getMainOp()->getType()->isPointerTy()) || + E->hasCopyableElements()) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -13036,6 +13454,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, SmallBitVector UsedScalars(Sz, false); for (unsigned I = 0; I < Sz; ++I) { if (isa<Instruction>(UniqueValues[I]) && + !E->isCopyableElement(UniqueValues[I]) && getTreeEntries(UniqueValues[I]).front() == E) continue; UsedScalars.set(I); @@ -14075,15 +14494,45 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // If the tree contains only phis, buildvectors, split nodes and // small nodes with reuses, we can skip it. 
+ SmallVector<const TreeEntry *> StoreLoadNodes; + unsigned NumGathers = 0; + constexpr int LimitTreeSize = 36; if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && - all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) { - return TE->State == TreeEntry::SplitVectorize || - (TE->isGather() && - none_of(TE->Scalars, IsaPred<ExtractElementInst>)) || - (TE->hasState() && (TE->getOpcode() == Instruction::PHI || - (!TE->ReuseShuffleIndices.empty() && - TE->Scalars.size() == 2))); - })) + all_of(VectorizableTree, + [&](const std::unique_ptr<TreeEntry> &TE) { + if (!TE->isGather() && TE->hasState() && + (TE->getOpcode() == Instruction::Load || + TE->getOpcode() == Instruction::Store)) { + StoreLoadNodes.push_back(TE.get()); + return true; + } + if (TE->isGather()) + ++NumGathers; + return TE->State == TreeEntry::SplitVectorize || + (TE->Idx == 0 && TE->Scalars.size() == 2 && + TE->hasState() && TE->getOpcode() == Instruction::ICmp && + VectorizableTree.size() > LimitTreeSize) || + (TE->isGather() && + none_of(TE->Scalars, IsaPred<ExtractElementInst>)) || + (TE->hasState() && + (TE->getOpcode() == Instruction::PHI || + (TE->hasCopyableElements() && + static_cast<unsigned>(count_if( + TE->Scalars, IsaPred<PHINode, Constant>)) >= + TE->Scalars.size() / 2) || + ((!TE->ReuseShuffleIndices.empty() || + !TE->ReorderIndices.empty() || TE->isAltShuffle()) && + TE->Scalars.size() == 2))); + }) && + (StoreLoadNodes.empty() || + (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() && + (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) { + return TE->getOpcode() == Instruction::Store || + all_of(TE->Scalars, [&](Value *V) { + return !isa<LoadInst>(V) || + areAllUsersVectorized(cast<Instruction>(V)); + }); + }))))) return true; // We can vectorize the tree if its size is greater than or equal to the @@ -14826,6 +15275,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals, bool IsProfitablePHIUser = (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic && VectorizableTree.front()->Scalars.size() > 2)) && + VectorizableTree.front()->hasState() && VectorizableTree.front()->getOpcode() == Instruction::PHI && !Inst->hasNUsesOrMore(UsesLimit) && none_of(Inst->users(), @@ -15276,7 +15726,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming // blocks. - if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp()); + if (auto *PHI = dyn_cast_or_null<PHINode>( + TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr); PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) { TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx); TEInsertPt = TEInsertBlock->getTerminator(); @@ -15375,7 +15826,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( "Expected only single user of a gather node."); const EdgeInfo &UseEI = TEPtr->UserTreeIndex; - PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize + PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize && + UseEI.UserTE->hasState()) ? 
dyn_cast<PHINode>(UseEI.UserTE->getMainOp()) : nullptr; Instruction *InsertPt = @@ -15388,7 +15840,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( TEUseEI.UserTE->isAltShuffle()) && all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) { if (UseEI.UserTE->State != TreeEntry::Vectorize || - (UseEI.UserTE->getOpcode() == Instruction::PHI && + (UseEI.UserTE->hasState() && + UseEI.UserTE->getOpcode() == Instruction::PHI && !UseEI.UserTE->isAltShuffle()) || !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock)) continue; @@ -16009,25 +16462,32 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { Instruction *Res = nullptr; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with - // constant indices or gathered loads). - auto *Front = E->getMainOp(); + // constant indices or gathered loads or copyables). + Instruction *Front; + unsigned Opcode; + if (E->hasState()) { + Front = E->getMainOp(); + Opcode = E->getOpcode(); + } else { + Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>)); + Opcode = Front->getOpcode(); + } auto *BB = Front->getParent(); - assert(((GatheredLoadsEntriesFirst.has_value() && - E->getOpcode() == Instruction::Load && E->isGather() && - E->Idx < *GatheredLoadsEntriesFirst) || - E->State == TreeEntry::SplitVectorize || - all_of(E->Scalars, - [=](Value *V) -> bool { - if (E->getOpcode() == Instruction::GetElementPtr && - !isa<GetElementPtrInst>(V)) - return true; - auto *I = dyn_cast<Instruction>(V); - return !I || !E->getMatchingMainOpOrAltOp(I) || - I->getParent() == BB || - isVectorLikeInstWithConstOps(I); - })) && - "Expected gathered loads or GEPs or instructions from same basic " - "block."); + assert( + ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load && + E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) || + E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() || + all_of(E->Scalars, + [=](Value *V) -> bool { + if (Opcode == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(V)) + return true; + auto *I = dyn_cast<Instruction>(V); + return !I || !E->getMatchingMainOpOrAltOp(I) || + I->getParent() == BB || isVectorLikeInstWithConstOps(I); + })) && + "Expected gathered loads or GEPs or instructions from same basic " + "block."); auto FindLastInst = [&]() { Instruction *LastInst = Front; @@ -16035,18 +16495,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; + if (E->isCopyableElement(I)) + continue; if (LastInst->getParent() == I->getParent()) { if (LastInst->comesBefore(I)) LastInst = I; continue; } - assert(((E->getOpcode() == Instruction::GetElementPtr && + assert(((Opcode == Instruction::GetElementPtr && !isa<GetElementPtrInst>(I)) || E->State == TreeEntry::SplitVectorize || (isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I)) || (GatheredLoadsEntriesFirst.has_value() && - E->getOpcode() == Instruction::Load && E->isGather() && + Opcode == Instruction::Load && E->isGather() && E->Idx < *GatheredLoadsEntriesFirst)) && "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(LastInst->getParent())) { @@ -16075,16 +16537,18 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; + if (E->isCopyableElement(I)) + continue; if (FirstInst->getParent() == I->getParent()) { if 
(I->comesBefore(FirstInst)) FirstInst = I; continue; } - assert(((E->getOpcode() == Instruction::GetElementPtr && - !isa<GetElementPtrInst>(I)) || - (isVectorLikeInstWithConstOps(FirstInst) && - isVectorLikeInstWithConstOps(I))) && - "Expected vector-like or non-GEP in GEP node insts only."); + assert(((Opcode == Instruction::GetElementPtr && + !isa<GetElementPtrInst>(I)) || + (isVectorLikeInstWithConstOps(FirstInst) && + isVectorLikeInstWithConstOps(I))) && + "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(FirstInst->getParent())) { FirstInst = I; continue; @@ -16122,7 +16586,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // Set insertpoint for gathered loads to the very first load. if (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && - E->getOpcode() == Instruction::Load) { + Opcode == Instruction::Load) { Res = FindFirstInst(); EntryToLastInstruction.try_emplace(E, Res); return *Res; @@ -16139,7 +16603,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return nullptr; for (Value *V : E->Scalars) { auto *I = dyn_cast<Instruction>(V); - if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I)) + if (!I || isa<PHINode>(I) || + (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I))) continue; ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I); if (Bundles.empty()) @@ -16153,13 +16618,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { }; const ScheduleBundle *Bundle = FindScheduleBundle(E); if (!E->isGather() && !Bundle) { - if ((E->getOpcode() == Instruction::GetElementPtr && + if ((Opcode == Instruction::GetElementPtr && any_of(E->Scalars, [](Value *V) { return !isa<GetElementPtrInst>(V) && isa<Instruction>(V); })) || - all_of(E->Scalars, [](Value *V) { - return isa<PoisonValue>(V) || + all_of(E->Scalars, [&](Value *V) { + return isa<PoisonValue>(V) || E->isCopyableElement(V) || (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V)); })) Res = FindLastInst(); @@ -18640,6 +19105,7 @@ Value *BoUpSLP::vectorizeTree( TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI || TE->UserTreeIndex.UserTE->isAltShuffle()) && + !TE->UserTreeIndex.UserTE->hasCopyableElements() && all_of(TE->UserTreeIndex.UserTE->Scalars, [](Value *V) { return isUsedOutsideBlock(V); })) { Instruction &LastInst = @@ -18903,7 +19369,7 @@ Value *BoUpSLP::vectorizeTree( continue; assert( (ExternallyUsedValues.count(Scalar) || - Scalar->hasNUsesOrMore(UsesLimit) || + ExternalUsesWithNonUsers.count(Scalar) || ExternalUsesAsOriginalScalar.contains(Scalar) || any_of( Scalar->users(), @@ -19182,7 +19648,7 @@ Value *BoUpSLP::vectorizeTree( if (auto *EE = dyn_cast<ExtractElementInst>(Scalar); EE && IgnoredExtracts.contains(EE)) continue; - if (isa<PoisonValue>(Scalar)) + if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar)) continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); @@ -19424,12 +19890,15 @@ void BoUpSLP::optimizeGatherSequence() { } BoUpSLP::ScheduleBundle & -BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { +BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, + const InstructionsState &S) { auto &BundlePtr = ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>()); for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; + if (S.isCopyableElement(V)) + continue; ScheduleData *BundleMember = 
getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " "(maybe not in same basic block)"); @@ -19450,10 +19919,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. + bool HasCopyables = S.areInstructionsWithCopyableElements(); if (isa<PHINode>(S.getMainOp()) || - isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL)) + isVectorLikeInstWithConstOps(S.getMainOp()) || + (!HasCopyables && doesNotNeedToSchedule(VL)) || + all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) return nullptr; + // TODO Remove once full support for copyables is landed. + assert(all_of(VL, + [&](Value *V) { + return !S.isCopyableElement(V) || S.isNonSchedulable(V); + }) && + "Copyable elements should not be schedulable"); // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); @@ -19499,7 +19977,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it @@ -19516,7 +19994,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && @@ -19541,7 +20019,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ReSchedule = true; } - ScheduleBundle &Bundle = buildBundle(VL); + ScheduleBundle &Bundle = buildBundle(VL, S); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { for (ScheduleData *BD : Bundle.getBundle()) { @@ -19558,7 +20036,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back(); } @@ -20187,7 +20665,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) { + return !isa<Constant>(V) && all_of(V->users(), [&](User *U) { return isa<InsertElementInst>(U) && !isVectorized(U); }); })) @@ -20555,9 +21033,10 @@ void BoUpSLP::computeMinimumValueSizes() { if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, SelectInst>(U) || isa<SIToFPInst, UIToFPInst>(U) || - !isa<CastInst, BinaryOperator, FreezeInst, PHINode, - SelectInst>(UserTE->getMainOp()) || - isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp())) + (UserTE->hasState() && + (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, + SelectInst>(UserTE->getMainOp()) || + isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp())))) return true; unsigned UserTESz = DL->getTypeSizeInBits( UserTE->Scalars.front()->getType()); @@ -20653,7 +21132,12 @@ void BoUpSLP::computeMinimumValueSizes() { if (!IsKnownPositive) ++BitWidth1; - APInt Mask = 
DB->getDemandedBits(cast<Instruction>(Root)); + auto *I = dyn_cast<Instruction>(Root); + if (!I) { + MaxBitWidth = std::max(BitWidth1, MaxBitWidth); + continue; + } + APInt Mask = DB->getDemandedBits(I); unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); MaxBitWidth = std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth); @@ -20802,6 +21286,7 @@ void BoUpSLP::computeMinimumValueSizes() { NodeIdx < VectorizableTree.size() && VectorizableTree[NodeIdx]->UserTreeIndex && VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 && + VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() && VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() == Instruction::Trunc && !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle(); @@ -20982,7 +21467,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, for (Value *V : Chain) ValOps.insert(cast<StoreInst>(V)->getValueOperand()); // Operands are not same/alt opcodes or non-power-of-2 uniques - exit. - InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true); if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) { DenseSet<Value *> Stores(Chain.begin(), Chain.end()); bool IsAllowedSize = diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 40a5565..25b9616 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -962,7 +962,11 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, BackedgeTakenCount->setUnderlyingValue(TCMO); } - VectorTripCount.setUnderlyingValue(VectorTripCountV); + if (!VectorTripCount.getUnderlyingValue()) + VectorTripCount.setUnderlyingValue(VectorTripCountV); + else + assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV && + "VectorTripCount set earlier must match VectorTripCountV"); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index db40ce2..6655149 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -906,10 +906,10 @@ template <unsigned PartOpIdx> class LLVM_ABI_FOR_TEST VPUnrollPartAccessor { protected: /// Return the VPValue operand containing the unroll part or null if there is /// no such operand. - VPValue *getUnrollPartOperand(VPUser &U) const; + VPValue *getUnrollPartOperand(const VPUser &U) const; /// Return the unroll part. - unsigned getUnrollPart(VPUser &U) const; + unsigned getUnrollPart(const VPUser &U) const; }; /// Helper to manage IR metadata for recipes. It filters out metadata that @@ -1662,6 +1662,8 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, VPSlotTracker &SlotTracker) const override; #endif + unsigned getOpcode() const { return Instruction::Select; } + VPValue *getCond() const { return getOperand(0); } @@ -2335,8 +2337,9 @@ public: return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized()); } - /// Generate the phi/select nodes. - void execute(VPTransformState &State) override; + void execute(VPTransformState &State) override { + llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends"); + } /// Return the cost of this VPWidenMemoryRecipe.
InstructionCost computeCost(ElementCount VF, @@ -3483,7 +3486,7 @@ public: /// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that /// this is only accurate after the VPlan has been unrolled. - bool isPart0() { return getUnrollPart(*this) == 0; } + bool isPart0() const { return getUnrollPart(*this) == 0; } VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 194874a..5319b8c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -437,9 +437,12 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, // We are about to replace the branch to exit the region. Remove the original // BranchOnCond, if there is any. + DebugLoc LatchDL = DL; if (!LatchVPBB->empty() && - match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) + match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) { + LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); LatchVPBB->getTerminator()->eraseFromParent(); + } VPBuilder Builder(LatchVPBB); // Add a VPInstruction to increment the scalar canonical IV by VF * UF. @@ -452,7 +455,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, // Add the BranchOnCount VPInstruction to the latch. Builder.createNaryOp(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, + LatchDL); } void VPlanTransforms::prepareForVectorization( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index f0cab79..fc8458c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -184,8 +184,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { VPValue *Cond = SI->getOperand(0); VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]); MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares; - for (const auto &[Idx, Succ] : - enumerate(ArrayRef(Src->getSuccessors()).drop_front())) { + for (const auto &[Idx, Succ] : enumerate(drop_begin(Src->getSuccessors()))) { VPBasicBlock *Dst = cast<VPBasicBlock>(Succ); assert(!getEdgeMask(Src, Dst) && "Edge masks already created"); // Cases whose destination is the same as default are redundant and can @@ -206,7 +205,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) { // cases with destination == Dst are taken. Join the conditions for each // case whose destination == Dst using an OR. 
VPValue *Mask = Conds[0]; - for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front()) + for (VPValue *V : drop_begin(Conds)) Mask = Builder.createOr(Mask, V); if (SrcMask) Mask = Builder.createLogicalAnd(SrcMask, Mask); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1fbc3f3..0d6152b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -413,14 +413,14 @@ void VPSingleDefRecipe::dump() const { VPDef::dump(); } template <unsigned PartOpIdx> VPValue * -VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const { +VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const { if (U.getNumOperands() == PartOpIdx + 1) return U.getOperand(PartOpIdx); return nullptr; } template <unsigned PartOpIdx> -unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const { +unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const { if (auto *UnrollPartOp = getUnrollPartOperand(U)) return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue(); return 0; @@ -991,7 +991,13 @@ bool VPInstruction::isVectorToScalar() const { } bool VPInstruction::isSingleScalar() const { - return getOpcode() == Instruction::PHI || isScalarCast(); + switch (getOpcode()) { + case Instruction::PHI: + case VPInstruction::ExplicitVectorLength: + return true; + default: + return isScalarCast(); + } } void VPInstruction::execute(VPTransformState &State) { @@ -2411,42 +2417,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -void VPBlendRecipe::execute(VPTransformState &State) { - assert(isNormalized() && "Expected blend to be normalized!"); - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // SELECT(Mask1, In1, - // In0))) - // Note that Mask0 is never used: lanes for which no path reaches this phi and - // are essentially undef are taken from In0. - bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); - Value *Result = nullptr; - for (unsigned In = 0; In < NumIncoming; ++In) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - Value *In0 = State.get(getIncomingValue(In), OnlyFirstLaneUsed); - if (In == 0) - Result = In0; // Initialize with the first incoming value. - else { - // Select between the current value and the previous incoming edge - // based on the incoming mask. 
- Value *Cond = State.get(getMask(In), OnlyFirstLaneUsed); - Result = State.Builder.CreateSelect(Cond, In0, Result, "predphi"); - } - } - State.set(this, Result, OnlyFirstLaneUsed); -} - InstructionCost VPBlendRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { // Handle cases where only the first lane is used the same way as the legacy diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index cb370fe..3d8e149 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -997,7 +997,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // InstSimplifyFolder. if (TypeSwitch<VPRecipeBase *, bool>(&R) .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, - VPReplicateRecipe>([&](auto *I) { + VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) { const DataLayout &DL = Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL, @@ -1883,9 +1883,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( } } -/// Remove BranchOnCond recipes with true or false conditions together with -/// removing dead edges to their successors. -static void removeBranchOnConst(VPlan &Plan) { +void VPlanTransforms::removeBranchOnConst(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( vp_depth_first_shallow(Plan.getEntry()))) { @@ -1908,12 +1906,9 @@ static void removeBranchOnConst(VPlan &Plan) { "There must be a single edge between VPBB and its successor"); // Values coming from VPBB into phi recipes of RemoveSucc are removed from // these recipes. - for (VPRecipeBase &R : RemovedSucc->phis()) { - auto *Phi = cast<VPPhiAccessors>(&R); - assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) && - "VPIRPhis must have a single predecessor"); - Phi->removeIncomingValueFor(VPBB); - } + for (VPRecipeBase &R : RemovedSucc->phis()) + cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB); + // Disconnect blocks and remove the terminator. RemovedSucc will be deleted // automatically on VPlan destruction if it becomes unreachable. VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc); @@ -2711,6 +2706,18 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, continue; } + // Expand VPBlendRecipe into VPInstruction::Select. + VPBuilder Builder(&R); + if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { + VPValue *Select = Blend->getIncomingValue(0); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) + Select = Builder.createSelect(Blend->getMask(I), + Blend->getIncomingValue(I), Select, + R.getDebugLoc(), "predphi"); + Blend->replaceAllUsesWith(Select); + ToRemove.push_back(Blend); + } + if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) { Expr->decompose(); ToRemove.push_back(Expr); @@ -2724,7 +2731,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, // Expand WideIVStep. 
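For reference, the lowering that used to live in VPBlendRecipe::execute now happens in convertToConcreteRecipes: a normalized blend with incoming values In0..InN-1 and masks Mask1..MaskN-1 is expanded into the chain select(MaskN-1, InN-1, ... select(Mask1, In1, In0) ...), with In0 as the unconditional fallback. Below is a minimal standalone sketch of that folding over plain scalar values (illustrative only, not the VPlan API); since the masks of a normalized blend are disjoint per lane, the chain simply picks the one incoming value whose mask is set:

#include <cassert>
#include <cstddef>
#include <vector>

// Toy model of the blend-to-select expansion: Incoming[0] is the fallback,
// and for I >= 1 Incoming[I] is guarded by Masks[I]. Folding in increasing
// order of I reproduces select(MaskN-1, InN-1, ... select(Mask1, In1, In0)).
int expandBlend(const std::vector<int> &Incoming,
                const std::vector<bool> &Masks) {
  assert(Incoming.size() == Masks.size() && !Incoming.empty());
  int Select = Incoming[0]; // Mask 0 is never consulted.
  for (std::size_t I = 1, E = Incoming.size(); I != E; ++I)
    Select = Masks[I] ? Incoming[I] : Select; // the "predphi" select
  return Select;
}

For example, expandBlend({7, 1, 2}, {false, false, true}) yields 2, matching the outermost select on Mask2.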
auto *VPI = cast<VPInstruction>(&R); - VPBuilder Builder(VPI); Type *IVTy = TypeInfo.inferScalarType(VPI); if (TypeInfo.inferScalarType(VectorStep) != IVTy) { Instruction::CastOps CastOp = IVTy->isFloatingPointTy() ? @@ -3082,6 +3088,29 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { } } +void VPlanTransforms::materializeVectorTripCount( + VPlan &Plan, ElementCount BestVF, unsigned BestUF, + PredicatedScalarEvolution &PSE) { + assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); + assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); + + VPValue *TC = Plan.getTripCount(); + // Skip cases for which the trip count may be non-trivial to materialize. + if (!Plan.hasScalarTail() || + Plan.getMiddleBlock()->getSingleSuccessor() == + Plan.getScalarPreheader() || + !TC->isLiveIn()) + return; + // Materialize the vector trip count early for constant trip counts, if it + // can simply be computed as (Original TC / (VF * UF)) * (VF * UF). + ScalarEvolution &SE = *PSE.getSE(); + auto *TCScev = SE.getSCEV(TC->getLiveInIRValue()); + const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF); + auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF); + if (auto *NewC = dyn_cast<SCEVConstant>(VecTCScev)) + Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue()); +} + /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ab189f6..d5af6cd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -224,6 +224,10 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// Remove BranchOnCond recipes with true or false conditions together with + /// removing dead edges to their successors. + static void removeBranchOnConst(VPlan &Plan); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. @@ -234,6 +238,12 @@ struct VPlanTransforms { /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors. static void materializeBroadcasts(VPlan &Plan); + // Materialize the vector trip count early for constant trip counts, if it can + // simply be computed as (Original TC / (VF * UF)) * (VF * UF).
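A worked example of the constant folding performed above (numbers are illustrative): with an original trip count of 1000, VF = 8 and UF = 2, VFxUF is 16, so the materialized vector trip count is (1000 udiv 16) * 16 = 62 * 16 = 992, and the remaining 8 iterations run in the scalar epilogue. A self-contained sketch of the same arithmetic with plain integers rather than SCEV expressions:

#include <cassert>
#include <cstdint>

// Toy model of the vector trip count materialized above:
// VecTC = (TC / (VF * UF)) * (VF * UF), with unsigned (udiv) semantics.
std::uint64_t vectorTripCount(std::uint64_t TC, std::uint64_t VF,
                              std::uint64_t UF) {
  std::uint64_t VFxUF = VF * UF;
  assert(VFxUF != 0 && "VF * UF must be non-zero");
  return TC / VFxUF * VFxUF; // iterations handled by the vector loop
}

// vectorTripCount(1000, 8, 2) == 992; the trailing 8 iterations are left to
// the scalar tail loop.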
+ static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF, + unsigned BestUF, + PredicatedScalarEvolution &PSE); + /// Try to convert a plan with interleave groups with VF elements to a plan /// with the interleave groups replaced by wide loads and stores processing VF /// elements, if all transformed interleave groups access the full vector diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 82adc34..6252f4f 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -3174,6 +3174,55 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) { return true; } +/// Returns true if this ShuffleVectorInst eventually feeds into a +/// vector reduction intrinsic (e.g., vector_reduce_add) by only following +/// chains of shuffles and binary operators (in any combination/order). +/// The search visits at most a fixed number of instructions (MaxVisited). +static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) { + constexpr unsigned MaxVisited = 32; + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Instruction *, 4> WorkList; + bool FoundReduction = false; + + WorkList.push_back(SVI); + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (!UI || !Visited.insert(UI).second) + continue; + if (Visited.size() > MaxVisited) + return false; + if (auto *II = dyn_cast<IntrinsicInst>(UI)) { + // More than one reduction reached. + if (FoundReduction) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + FoundReduction = true; + continue; + default: + return false; + } + } + + if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI)) + return false; + + WorkList.emplace_back(UI); + } + } + return FoundReduction; +} + /// This method looks for groups of shuffles acting on binops, of the form: /// %x = shuffle ... /// %y = shuffle ... @@ -3416,6 +3465,65 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind); }; + unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits(); + unsigned MaxVectorSize = + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); + unsigned MaxElementsInVector = MaxVectorSize / ElementSize; + // When there are multiple shufflevector operations on the same input, + // especially when the vector length is larger than the register size, + // identical shuffle patterns may occur across different groups of elements. + // To avoid overestimating the cost by counting these repeated shuffles more + // than once, we only account for unique shuffle patterns. This adjustment + // prevents inflated costs in the cost model for wide vectors split into + // several register-sized groups. + std::set<SmallVector<int, 4>> UniqueShuffles; + auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) { + // Compute the cost for performing the shuffle over the full vector.
+ auto ShuffleCost = + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind); + unsigned NumFullVectors = Mask.size() / MaxElementsInVector; + if (NumFullVectors < 2) + return C + ShuffleCost; + SmallVector<int, 4> SubShuffle(MaxElementsInVector); + unsigned NumUniqueGroups = 0; + unsigned NumGroups = Mask.size() / MaxElementsInVector; + // For each group of MaxElementsInVector contiguous elements, + // collect their shuffle pattern and insert into the set of unique patterns. + for (unsigned I = 0; I < NumFullVectors; ++I) { + for (unsigned J = 0; J < MaxElementsInVector; ++J) + SubShuffle[J] = Mask[MaxElementsInVector * I + J]; + if (UniqueShuffles.insert(SubShuffle).second) + NumUniqueGroups += 1; + } + return C + ShuffleCost * NumUniqueGroups / NumGroups; + }; + auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return C; + SmallVector<int, 16> Mask; + SV->getShuffleMask(Mask); + return AddShuffleMaskAdjustedCost(C, Mask); + }; + // Check that input consists of ShuffleVectors applied to the same input + auto AllShufflesHaveSameOperands = + [](SmallPtrSetImpl<Instruction *> &InputShuffles) { + if (InputShuffles.size() < 2) + return false; + ShuffleVectorInst *FirstSV = + dyn_cast<ShuffleVectorInst>(*InputShuffles.begin()); + if (!FirstSV) + return false; + + Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1); + return std::all_of( + std::next(InputShuffles.begin()), InputShuffles.end(), + [&](Instruction *I) { + ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I); + return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1; + }); + }; + // Get the costs of the shuffles + binops before and after with the new // shuffle masks. InstructionCost CostBefore = @@ -3423,8 +3531,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); - CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), - InstructionCost(0), AddShuffleCost); + if (AllShufflesHaveSameOperands(InputShuffles)) { + UniqueShuffles.clear(); + CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), + InstructionCost(0), AddShuffleAdjustedCost); + } else { + CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), + InstructionCost(0), AddShuffleCost); + } // The new binops will be unused for lanes past the used shuffle lengths. // These types attempt to get the correct cost for that from the target. 
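The adjusted costing above charges the full-vector shuffle cost only once per distinct register-sized sub-mask. The following is a rough standalone sketch of that accounting with plain standard-library types (not the TTI interface); it deduplicates within a single mask, whereas the patch keeps the UniqueShuffles set alive across all shuffles being costed:

#include <cstddef>
#include <set>
#include <vector>

// Split Mask into groups of GroupSize contiguous elements, count the distinct
// group patterns, and scale the full shuffle cost by UniqueGroups/TotalGroups.
double adjustedShuffleCost(const std::vector<int> &Mask, std::size_t GroupSize,
                           double FullCost) {
  std::size_t NumGroups = Mask.size() / GroupSize;
  if (NumGroups < 2)
    return FullCost;
  std::set<std::vector<int>> Unique;
  for (std::size_t G = 0; G != NumGroups; ++G)
    Unique.insert(std::vector<int>(Mask.begin() + G * GroupSize,
                                   Mask.begin() + (G + 1) * GroupSize));
  return FullCost * Unique.size() / NumGroups;
}

// Example: a 16-lane mask made of the 4-lane groups {0,1,2,3} and {4,5,6,7},
// each repeated twice, has 2 unique groups out of 4, so only half of the full
// shuffle cost is counted.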
@@ -3435,8 +3549,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { InstructionCost CostAfter = TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) + TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind); + UniqueShuffles.clear(); CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), - InstructionCost(0), AddShuffleMaskCost); + InstructionCost(0), AddShuffleMaskAdjustedCost); std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); CostAfter += std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), @@ -3445,7 +3560,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore << " vs CostAfter: " << CostAfter << "\n"); - if (CostBefore <= CostAfter) + if (CostBefore < CostAfter || + (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI))) return false; // The cost model has passed, create the new instructions.
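The relaxed exit condition above keeps the transform on a cost tie only when the shuffle ultimately feeds a single vector reduction. feedsIntoVectorReduction implements that as a bounded worklist walk over the users: only shuffles and binary operators may appear on the path, at most a fixed number of instructions is visited, and exactly one reduction intrinsic must be reached. A standalone sketch of the same traversal over a toy def-use graph (plain adjacency lists, not llvm::User chains):

#include <cstddef>
#include <set>
#include <vector>

enum class NodeKind { Shuffle, BinOp, Reduction, Other };

struct Node {
  NodeKind Kind;
  std::vector<const Node *> Users; // toy def-use edges
};

// Follow users through shuffles/binops only, visit at most MaxVisited nodes,
// and succeed iff exactly one reduction is reached and no other kind of user
// appears on the way.
bool feedsSingleReduction(const Node *Root) {
  constexpr std::size_t MaxVisited = 32;
  std::set<const Node *> Visited;
  std::vector<const Node *> WorkList{Root};
  bool FoundReduction = false;
  while (!WorkList.empty()) {
    const Node *N = WorkList.back();
    WorkList.pop_back();
    for (const Node *U : N->Users) {
      if (!Visited.insert(U).second)
        continue;
      if (Visited.size() > MaxVisited)
        return false;
      if (U->Kind == NodeKind::Reduction) {
        if (FoundReduction)
          return false; // more than one reduction reached
        FoundReduction = true;
        continue;
      }
      if (U->Kind != NodeKind::Shuffle && U->Kind != NodeKind::BinOp)
        return false; // escapes the shuffle/binop chain
      WorkList.push_back(U);
    }
  }
  return FoundReduction;
}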