Diffstat (limited to 'llvm/lib')
66 files changed, 824 insertions, 216 deletions
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 6f19a68..27114e0 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -206,6 +206,14 @@ static dxil::ElementType toDXILElementType(Type *Ty, bool IsSigned) { return ElementType::Invalid; } +static dxil::ElementType toDXILStorageType(dxil::ElementType ET) { + if (ET == dxil::ElementType::U64 || ET == dxil::ElementType::F64 || + ET == dxil::ElementType::I64 || ET == dxil::ElementType::SNormF64 || + ET == dxil::ElementType::UNormF64) + return dxil::ElementType::U32; + return ET; +} + ResourceTypeInfo::ResourceTypeInfo(TargetExtType *HandleTy, const dxil::ResourceClass RC_, const dxil::ResourceKind Kind_) @@ -569,10 +577,11 @@ ResourceTypeInfo::TypedInfo ResourceTypeInfo::getTyped() const { auto [ElTy, IsSigned] = getTypedElementType(Kind, HandleTy); dxil::ElementType ET = toDXILElementType(ElTy, IsSigned); + dxil::ElementType DXILStorageTy = toDXILStorageType(ET); uint32_t Count = 1; if (auto *VTy = dyn_cast<FixedVectorType>(ElTy)) Count = VTy->getNumElements(); - return {ET, Count}; + return {ET, DXILStorageTy, Count}; } dxil::SamplerFeedbackType ResourceTypeInfo::getFeedbackType() const { @@ -636,7 +645,10 @@ void ResourceTypeInfo::print(raw_ostream &OS, const DataLayout &DL) const { OS << " Alignment: " << Struct.AlignLog2 << "\n"; } else if (isTyped()) { TypedInfo Typed = getTyped(); - OS << " Element Type: " << getElementTypeName(Typed.ElementTy) << "\n" + OS << " Element Type: " << getElementTypeName(Typed.ElementTy); + if (Typed.ElementTy != Typed.DXILStorageTy) + OS << " (stored as " << getElementTypeName(Typed.DXILStorageTy) << ")"; + OS << "\n" << " Element Count: " << Typed.ElementCount << "\n"; } else if (isFeedback()) OS << " Feedback Type: " << getSamplerFeedbackTypeName(getFeedbackType()) @@ -714,7 +726,8 @@ MDTuple *ResourceInfo::getAsMetadata(Module &M, Tags.push_back(getIntMD(RTI.getStruct(DL).Stride)); } else if (RTI.isTyped()) { Tags.push_back(getIntMD(llvm::to_underlying(ExtPropTags::ElementType))); - Tags.push_back(getIntMD(llvm::to_underlying(RTI.getTyped().ElementTy))); + Tags.push_back( + getIntMD(llvm::to_underlying(RTI.getTyped().DXILStorageTy))); } else if (RTI.isFeedback()) { Tags.push_back( getIntMD(llvm::to_underlying(ExtPropTags::SamplerFeedbackKind))); diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index a572eef..84ee8c0 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1131,9 +1131,14 @@ bool DependenceInfo::haveSameSD(const Loop *SrcLoop, if (SE->hasLoopInvariantBackedgeTakenCount(DstLoop)) DstUP = SE->getBackedgeTakenCount(DstLoop); - if (SrcUB != nullptr && DstUP != nullptr && - SE->isKnownPredicate(ICmpInst::ICMP_EQ, SrcUB, DstUP)) - return true; + if (SrcUB != nullptr && DstUP != nullptr) { + Type *WiderType = SE->getWiderType(SrcUB->getType(), DstUP->getType()); + SrcUB = SE->getNoopOrZeroExtend(SrcUB, WiderType); + DstUP = SE->getNoopOrZeroExtend(DstUP, WiderType); + + if (SE->isKnownPredicate(ICmpInst::ICMP_EQ, SrcUB, DstUP)) + return true; + } return false; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7adb25d..e27a9b1 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2982,6 +2982,10 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { if (!StrideExpr) return; + if (auto *Unknown = 
dyn_cast<SCEVUnknown>(StrideExpr)) + if (isa<UndefValue>(Unknown->getValue())) + return; + LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for " "versioning:"); LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n"); diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 93ae548..7bef3a8 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -86,10 +86,7 @@ template <> struct llvm::DenseMapInfo<VariableID> { using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>; template <> struct std::hash<VarLocInsertPt> { - using argument_type = VarLocInsertPt; - using result_type = std::size_t; - - result_type operator()(const argument_type &Arg) const { + std::size_t operator()(const VarLocInsertPt &Arg) const { return std::hash<void *>()(Arg.getOpaqueValue()); } }; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8676060..cf221bb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16481,10 +16481,34 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { DAG, DL); } break; - case ISD::AVGFLOORS: - case ISD::AVGFLOORU: case ISD::AVGCEILS: case ISD::AVGCEILU: + // trunc (avgceilu (sext (x), sext (y))) -> avgceils(x, y) + // trunc (avgceils (zext (x), zext (y))) -> avgceilu(x, y) + if (N0.hasOneUse()) { + SDValue Op0 = N0.getOperand(0); + SDValue Op1 = N0.getOperand(1); + if (N0.getOpcode() == ISD::AVGCEILU) { + if (TLI.isOperationLegalOrCustom(ISD::AVGCEILS, VT) && + Op0.getOpcode() == ISD::SIGN_EXTEND && + Op1.getOpcode() == ISD::SIGN_EXTEND && + Op0.getOperand(0).getValueType() == VT && + Op1.getOperand(0).getValueType() == VT) + return DAG.getNode(ISD::AVGCEILS, DL, VT, Op0.getOperand(0), + Op1.getOperand(0)); + } else { + if (TLI.isOperationLegalOrCustom(ISD::AVGCEILU, VT) && + Op0.getOpcode() == ISD::ZERO_EXTEND && + Op1.getOpcode() == ISD::ZERO_EXTEND && + Op0.getOperand(0).getValueType() == VT && + Op1.getOperand(0).getValueType() == VT) + return DAG.getNode(ISD::AVGCEILU, DL, VT, Op0.getOperand(0), + Op1.getOperand(0)); + } + } + [[fallthrough]]; + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: case ISD::ABDS: case ISD::ABDU: // (trunc (avg a, b)) -> (avg (trunc a), (trunc b)) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index 73df62a..41cea45 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1344,9 +1344,20 @@ void DWARFContext::dump( DWARFTypeUnit *DWARFContext::getTypeUnitForHash(uint64_t Hash, bool IsDWO) { DWARFUnitVector &DWOUnits = State->getDWOUnits(); if (const auto &TUI = getTUIndex()) { - if (const auto *R = TUI.getFromHash(Hash)) - return dyn_cast_or_null<DWARFTypeUnit>( - DWOUnits.getUnitForIndexEntry(*R)); + if (const auto *R = TUI.getFromHash(Hash)) { + if (TUI.getVersion() >= 5) { + return dyn_cast_or_null<DWARFTypeUnit>( + DWOUnits.getUnitForIndexEntry(*R, DW_SECT_INFO)); + } else { + DWARFUnit *TypesUnit = nullptr; + getDWARFObj().forEachTypesDWOSections([&](const DWARFSection &S) { + if (!TypesUnit) + TypesUnit = + DWOUnits.getUnitForIndexEntry(*R, DW_SECT_EXT_TYPES, &S); + }); + return dyn_cast_or_null<DWARFTypeUnit>(TypesUnit); + } + } return nullptr; } return State->getTypeUnitMap(IsDWO).lookup(Hash); @@ -1358,7 +1369,7 @@ DWARFCompileUnit 
*DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) { if (const auto &CUI = getCUIndex()) { if (const auto *R = CUI.getFromHash(Hash)) return dyn_cast_or_null<DWARFCompileUnit>( - DWOUnits.getUnitForIndexEntry(*R)); + DWOUnits.getUnitForIndexEntry(*R, DW_SECT_INFO)); return nullptr; } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index ef59c82..da0bf03 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -161,17 +161,24 @@ DWARFUnit *DWARFUnitVector::getUnitForOffset(uint64_t Offset) const { return nullptr; } -DWARFUnit * -DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) { - const auto *CUOff = E.getContribution(DW_SECT_INFO); +DWARFUnit *DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E, + DWARFSectionKind Sec, + const DWARFSection *Section) { + const auto *CUOff = E.getContribution(Sec); if (!CUOff) return nullptr; uint64_t Offset = CUOff->getOffset(); - auto end = begin() + getNumInfoUnits(); + auto begin = this->begin(); + auto end = begin + getNumInfoUnits(); + + if (Sec == DW_SECT_EXT_TYPES) { + begin = end; + end = this->end(); + } auto *CU = - std::upper_bound(begin(), end, CUOff->getOffset(), + std::upper_bound(begin, end, CUOff->getOffset(), [](uint64_t LHS, const std::unique_ptr<DWARFUnit> &RHS) { return LHS < RHS->getNextUnitOffset(); }); @@ -181,13 +188,14 @@ DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) { if (!Parser) return nullptr; - auto U = Parser(Offset, DW_SECT_INFO, nullptr, &E); + auto U = Parser(Offset, Sec, Section, &E); if (!U) return nullptr; auto *NewCU = U.get(); this->insert(CU, std::move(U)); - ++NumInfoUnits; + if (Sec == DW_SECT_INFO) + ++NumInfoUnits; return NewCU; } diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index de29330..7290a86 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -2499,7 +2499,7 @@ void PrintCrashIRInstrumentation::registerCallbacks( [&PIC, this](StringRef PassID, Any IR) { SavedIR.clear(); raw_string_ostream OS(SavedIR); - OS << formatv("*** Dump of {0}IR Before Last Pass {1}", + OS << formatv("; *** Dump of {0}IR Before Last Pass {1}", llvm::forcePrintModuleIR() ? "Module " : "", PassID); if (!isInteresting(IR, PassID, PIC.getPassNameForClassName(PassID))) { OS << " Filtered Out ***\n"; diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp index 0e98cad..09f24e9 100644 --- a/llvm/lib/Remarks/Remark.cpp +++ b/llvm/lib/Remarks/Remark.cpp @@ -13,6 +13,7 @@ #include "llvm/Remarks/Remark.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include <optional> using namespace llvm; @@ -26,16 +27,13 @@ std::string Remark::getArgsAsMsg() const { return Str; } -/// Returns the value of a specified key parsed from StringRef. 
-std::optional<int> Argument::getValAsInt() const { - APInt KeyVal; - if (Val.getAsInteger(10, KeyVal)) - return std::nullopt; - return KeyVal.getSExtValue(); +Argument *Remark::getArgByKey(StringRef Key) { + auto *It = find_if(Args, [&](auto &Arg) { return Arg.Key == Key; }); + if (It == Args.end()) + return nullptr; + return &*It; } -bool Argument::isValInt() const { return getValAsInt().has_value(); } - void RemarkLocation::print(raw_ostream &OS) const { OS << "{ " << "File: " << SourceFilePath << ", Line: " << SourceLine diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 3a97185..246d90c 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -94,6 +94,19 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); + if (Suffix.empty() && Prefix.empty()) { + // If both prefix and suffix are empty put into special tree to search by + // substring in a middle. + StringRef Substr = G.Pattern.longest_substr(); + if (!Substr.empty()) { + // But only if substring is not empty. Searching this tree is more + // expensive. + auto &V = SubstrToGlob.emplace(Substr).first->second; + V.emplace_back(&G); + continue; + } + } + auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; V.emplace_back(&G); @@ -119,6 +132,25 @@ void SpecialCaseList::GlobMatcher::match( } } } + + if (!SubstrToGlob.empty()) { + // As we don't know when substring exactly starts, we will try all + // possibilities. In most cases search will fail on first characters. + for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { + for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { + for (const auto *G : V) { + if (G->Pattern.match(Query)) { + Cb(G->Name, G->LineNo); + // As soon as we find a match in the vector, we can break for this + // vector, since the globs are already sorted by priority within the + // prefix group. However, we continue searching other prefix groups + // in the map, as they may contain a better match overall. + break; + } + } + } + } + } } SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp index 137ff89..f13554f 100644 --- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -47,6 +47,8 @@ public: StringRef getPassName() const override { return AARCH64_BRANCH_TARGETS_NAME; } private: + const AArch64Subtarget *Subtarget; + void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump, bool NeedsWinCFI); }; @@ -75,6 +77,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { << "********** Function: " << MF.getName() << '\n'); const Function &F = MF.getFunction(); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + // LLVM does not consider basic blocks which are the targets of jump tables // to be address-taken (the address can't escape anywhere else), but they are // used for indirect branches, so need BTI instructions. @@ -100,9 +104,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { // a BTI, and pointing the indirect branch at that. For non-ELF targets we // can't rely on that, so we assume that `CouldCall` is _always_ true due // to the risk of long-branch thunks at link time. 
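[Editor's sketch, not part of the patch: the HintNum computation later in this AArch64BranchTargets hunk encodes the BTI flavour as a HINT immediate — base 32, bit 1 for call targets, bit 2 for jump targets. A minimal standalone C++ illustration of that encoding:]

#include <cassert>

// The BTI flavour is a HINT immediate: 32 is plain BTI, |2 ("c") marks
// call targets, |4 ("j") marks jump targets; both set gives "bti jc".
unsigned btiHintNum(bool CouldCall, bool CouldJump) {
  unsigned HintNum = 32;
  if (CouldCall)
    HintNum |= 2;
  if (CouldJump)
    HintNum |= 4;
  return HintNum;
}

int main() {
  assert(btiHintNum(true, false) == 34); // bti c
  assert(btiHintNum(false, true) == 36); // bti j
  assert(btiHintNum(true, true) == 38);  // bti jc
}
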
- if (&MBB == &*MF.begin() && - (!MF.getSubtarget<AArch64Subtarget>().isTargetELF() || - (F.hasAddressTaken() || !F.hasLocalLinkage()))) + if (&MBB == &*MF.begin() && (!Subtarget->isTargetELF() || + (F.hasAddressTaken() || !F.hasLocalLinkage()))) CouldCall = true; // If the block itself is address-taken, it could be indirectly branched @@ -132,9 +135,6 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, << (CouldCall ? "c" : "") << " to " << MBB.getName() << "\n"); - const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>( - MBB.getParent()->getSubtarget().getInstrInfo()); - unsigned HintNum = 32; if (CouldCall) HintNum |= 2; @@ -162,6 +162,8 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall, MBBI->getOpcode() == AArch64::PACIBSP)) return; + const AArch64InstrInfo *TII = Subtarget->getInstrInfo(); + // Insert BTI exactly at the first executable instruction. const DebugLoc DL = MBB.findDebugLoc(MBBI); MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT)) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 1e607f4..f63981b 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1871,7 +1871,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { } bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); bool Modified = false; for (auto &MBB : MF) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index c76689f..0f7b34c 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -644,10 +644,10 @@ bool AArch64FrameLowering::hasReservedCallFrame( MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - const AArch64TargetLowering *TLI = - MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); + + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64InstrInfo *TII = Subtarget.getInstrInfo(); + const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo(); DebugLoc DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); @@ -1319,8 +1319,8 @@ StackOffset AArch64FrameLowering::getStackOffset(const MachineFunction &MF, // TODO: This function currently does not work for scalable vectors. int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const { - const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64RegisterInfo *RegInfo = + MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP ? 
getFPOffset(MF, ObjectOffset).getFixed() @@ -1343,10 +1343,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( TargetStackID::Value StackID, Register &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); - const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed(); int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); @@ -1466,7 +1465,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( return FPOffset; } FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() - : (unsigned)AArch64::SP; + : MCRegister(AArch64::SP); return SPOffset; } @@ -1589,8 +1588,8 @@ static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, namespace { struct RegPairInfo { - unsigned Reg1 = AArch64::NoRegister; - unsigned Reg2 = AArch64::NoRegister; + Register Reg1; + Register Reg2; int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; @@ -1598,21 +1597,21 @@ struct RegPairInfo { RegPairInfo() = default; - bool isPaired() const { return Reg2 != AArch64::NoRegister; } + bool isPaired() const { return Reg2.isValid(); } bool isScalable() const { return Type == PPR || Type == ZPR; } }; } // end anonymous namespace -unsigned findFreePredicateReg(BitVector &SavedRegs) { +MCRegister findFreePredicateReg(BitVector &SavedRegs) { for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) { if (SavedRegs.test(PReg)) { unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0; - return PNReg; + return MCRegister(PNReg); } } - return AArch64::NoRegister; + return MCRegister(); } // The multivector LD/ST are available only for SME or SVE2p1 targets @@ -1930,8 +1929,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } bool PTrueCreated = false; for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) { - unsigned Reg1 = RPI.Reg1; - unsigned Reg2 = RPI.Reg2; + Register Reg1 = RPI.Reg1; + Register Reg2 = RPI.Reg2; unsigned StrOpc; // Issue sequence of spills for cs regs. 
The first spill may be converted @@ -1967,7 +1966,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( break; } - unsigned X0Scratch = AArch64::NoRegister; + Register X0Scratch; auto RestoreX0 = make_scope_exit([&] { if (X0Scratch != AArch64::NoRegister) BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0) @@ -2009,11 +2008,15 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( } } - LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); - if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); - dbgs() << ") -> fi#(" << RPI.FrameIdx; - if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; - dbgs() << ")\n"); + LLVM_DEBUG({ + dbgs() << "CSR spill: (" << printReg(Reg1, TRI); + if (RPI.isPaired()) + dbgs() << ", " << printReg(Reg2, TRI); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx + 1; + dbgs() << ")\n"; + }); assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) && "Windows unwdinding requires a consecutive (FP,LR) pair"); @@ -2143,8 +2146,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( bool PTrueCreated = false; for (const RegPairInfo &RPI : RegPairs) { - unsigned Reg1 = RPI.Reg1; - unsigned Reg2 = RPI.Reg2; + Register Reg1 = RPI.Reg1; + Register Reg2 = RPI.Reg2; // Issue sequence of restores for cs regs. The last restore may be converted // to a post-increment load later by emitEpilogue if the callee-save stack @@ -2176,11 +2179,15 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( case RegPairInfo::VG: continue; } - LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); - if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); - dbgs() << ") -> fi#(" << RPI.FrameIdx; - if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; - dbgs() << ")\n"); + LLVM_DEBUG({ + dbgs() << "CSR restore: (" << printReg(Reg1, TRI); + if (RPI.isPaired()) + dbgs() << ", " << printReg(Reg2, TRI); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx + 1; + dbgs() << ")\n"; + }); // Windows unwind codes require consecutive registers if registers are // paired. Make the switch here, so that the code below will save (x,x+1) @@ -2435,8 +2442,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; @@ -2444,9 +2450,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); - unsigned BasePointerReg = RegInfo->hasBasePointer(MF) - ? RegInfo->getBaseRegister() - : (unsigned)AArch64::NoRegister; + MCRegister BasePointerReg = + RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : MCRegister(); unsigned ExtraCSSpill = 0; bool HasUnpairedGPR64 = false; @@ -2456,7 +2461,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Figure out which callee-saved registers to save/restore. 
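[Editor's sketch, not part of the patch: much of this AArch64FrameLowering hunk swaps raw `unsigned` registers and `AArch64::NoRegister` sentinels for `Register`/`MCRegister`, which default-construct to an invalid state. A simplified stand-in type (hypothetical, not the LLVM class) showing the idiom the diff moves to:]

#include <cassert>

// Stand-in for llvm::MCRegister: id 0 means "no register", so isValid()
// replaces explicit comparisons against a NoRegister sentinel.
class FakeMCRegister {
  unsigned Id = 0;

public:
  FakeMCRegister() = default;
  explicit FakeMCRegister(unsigned Id) : Id(Id) {}
  bool isValid() const { return Id != 0; }
};

int main() {
  FakeMCRegister PairedReg;     // was: unsigned PairedReg = AArch64::NoRegister;
  assert(!PairedReg.isValid()); // was: PairedReg != AArch64::NoRegister
  PairedReg = FakeMCRegister(42);
  assert(PairedReg.isValid());
}
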
for (unsigned i = 0; CSRegs[i]; ++i) { - const unsigned Reg = CSRegs[i]; + const MCRegister Reg = CSRegs[i]; // Add the base pointer register to SavedRegs if it is callee-save. if (Reg == BasePointerReg) @@ -2470,7 +2475,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } bool RegUsed = SavedRegs.test(Reg); - unsigned PairedReg = AArch64::NoRegister; + MCRegister PairedReg; const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg); if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) || AArch64::FPR128RegClass.contains(Reg)) { @@ -2522,8 +2527,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // Find a suitable predicate register for the multi-vector spill/fill // instructions. - unsigned PnReg = findFreePredicateReg(SavedRegs); - if (PnReg != AArch64::NoRegister) + MCRegister PnReg = findFreePredicateReg(SavedRegs); + if (PnReg.isValid()) AFI->setPredicateRegForFillSpill(PnReg); // If no free callee-save has been found assign one. if (!AFI->getPredicateRegForFillSpill() && @@ -2558,7 +2563,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned PPRCSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto *RC = TRI->getMinimalPhysRegClass(Reg); + auto *RC = TRI->getMinimalPhysRegClass(MCRegister(Reg)); assert(RC && "expected register class!"); auto SpillSize = TRI->getSpillSize(*RC); bool IsZPR = AArch64::ZPRRegClass.contains(Reg); @@ -2600,7 +2605,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, LLVM_DEBUG({ dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) - dbgs() << ' ' << printReg(Reg, RegInfo); + dbgs() << ' ' << printReg(MCRegister(Reg), RegInfo); dbgs() << "\n"; }); diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp index d67182d..03dd1cd 100644 --- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp +++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp @@ -649,7 +649,7 @@ bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) { } bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); bool Modified = false; for (auto &MBB : MF) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 79975b0..5bfb19d9 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -620,7 +620,7 @@ AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { return RC; } -unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } +MCRegister AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 47d76f3..3b0f4f6 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -124,7 +124,7 @@ public: bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; bool 
hasBasePointer(const MachineFunction &MF) const; - unsigned getBaseRegister() const; + MCRegister getBaseRegister() const; bool isArgumentRegister(const MachineFunction &MF, MCRegister Reg) const override; diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index d695f26..b4a4f4c 100644 --- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -33,6 +33,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -49,8 +50,8 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Pass.h" -#include <unordered_map> #include <map> +#include <unordered_map> using namespace llvm; @@ -67,7 +68,7 @@ namespace { struct AArch64SIMDInstrOpt : public MachineFunctionPass { static char ID; - const TargetInstrInfo *TII; + const AArch64InstrInfo *TII; MachineRegisterInfo *MRI; TargetSchedModel SchedModel; @@ -694,13 +695,9 @@ bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); - const TargetSubtargetInfo &ST = MF.getSubtarget(); - const AArch64InstrInfo *AAII = - static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); - if (!AAII) - return false; + const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>(); + TII = ST.getInstrInfo(); SchedModel.init(&ST); if (!SchedModel.hasInstrSchedModel()) return false; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 5c3e26e..4cd51d6 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1114,7 +1114,6 @@ bool AArch64InstPrinter::printSyslAlias(const MCInst *MI, } else return false; - std::string Str; llvm::transform(Name, Name.begin(), ::tolower); O << '\t' << Ins << '\t' << Reg.str() << ", " << Name; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 1c8383c..54d94b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1466,6 +1466,13 @@ def FeatureClusters : SubtargetFeature< "clusters", "Has clusters of workgroups support" >; +def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature< + "waits-before-system-scope-stores", + "RequiresWaitsBeforeSystemScopeStores", + "true", + "Target requires waits for loads and atomics before system scope stores" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -2060,7 +2067,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureMaxHardClauseLength32, Feature1_5xVGPRs, FeatureMemoryAtomicFAddF32DenormalSupport, - FeatureBVHDualAndBVH8Insts + FeatureBVHDualAndBVH8Insts, + FeatureWaitsBeforeSystemScopeStores, ]>; def FeatureISAVersion12_50 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5580e4c..09338c5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9028,6 +9028,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); + // Parse a dummy operand as a placeholder for the SWZ operand. This enforces + // agreement between MCInstrDesc.getNumOperands and MCInst.getNumOperands. + Inst.addOperand(MCOperand::createImm(0)); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ac660d5..f377b8a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -290,6 +290,7 @@ protected: bool Has45BitNumRecordsBufferResource = false; bool HasClusters = false; + bool RequiresWaitsBeforeSystemScopeStores = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1861,6 +1862,10 @@ public: bool has45BitNumRecordsBufferResource() const { return Has45BitNumRecordsBufferResource; } + + bool requiresWaitsBeforeSystemScopeStores() const { + return RequiresWaitsBeforeSystemScopeStores; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index a177a42..6ab8d552 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2673,7 +2673,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
- if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) + if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic && + Scope == CPol::SCOPE_SYS) Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ebd2e7e..d80a6f3 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1874,9 +1874,13 @@ void SIRegisterInfo::buildSpillLoadStore( } bool IsSrcDstDef = SrcDstRegState & RegState::Define; + bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore; if (NeedSuperRegImpOperand && - (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) + (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) { MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + if (PartialReloadCopy) + MIB.addReg(ValueReg, RegState::Implicit); + } // The epilog restore of a wwm-scratch register can cause undesired // optimization during machine-cp post PrologEpilogInserter if the same diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b6..1931e0b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -838,9 +838,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo < let SubtargetPredicate = isGFX6GFX7GFX8GFX9; } -let Defs = [SCC] in { -def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; -} // End Defs = [SCC] +let isCommutable = 1, Defs = [SCC] in +def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32", + [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))] +>; let SubtargetPredicate = isGFX8GFX9 in { def S_RFE_RESTORE_B64 : SOP2_Pseudo < diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index 7453727..b60569e 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -421,6 +421,17 @@ def : ProcessorModel<"cortex-m52", CortexM55Model, [ARMv81mMainline, FeatureMVEVectorCostFactor1, HasMVEFloatOps]>; +def : ProcessorModel<"star-mc3", CortexM55Model, [ARMv81mMainline, + FeatureDSP, + FeatureFPARMv8_D16, + FeatureHasNoBranchPredictor, + FeaturePACBTI, + FeatureUseMISched, + FeaturePreferBranchAlign32, + FeatureHasSlowFPVMLx, + FeatureMVEVectorCostFactor1, + HasMVEFloatOps]>; + def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp index dc84ae4..9da3bdb 100644 --- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp +++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp @@ -49,7 +49,7 @@ static StringRef getRCPrefix(dxil::ResourceClass RC) { static StringRef getFormatName(const dxil::ResourceTypeInfo &RI) { if (RI.isTyped()) { - switch (RI.getTyped().ElementTy) { + switch (RI.getTyped().DXILStorageTy) { case dxil::ElementType::I1: return "i1"; case dxil::ElementType::I16: diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ca4a655..80c96c6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -1701,6 +1701,43 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, DAG.getConstant(Imm, DL, GRLenVT)); } +/// Lower VECTOR_SHUFFLE whose result is the reversed source vector. 
+/// +/// It is possible to do optimization for VECTOR_SHUFFLE performing vector +/// reverse whose mask likes: +/// <7, 6, 5, 4, 3, 2, 1, 0> +/// +/// When undef's appear in the mask they are treated as if they were whatever +/// value is necessary in order to fit the above forms. +static SDValue +lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { + // Only vectors with i8/i16 elements which cannot match other patterns + // directly needs to do this. + if (VT != MVT::v16i8 && VT != MVT::v8i16 && VT != MVT::v32i8 && + VT != MVT::v16i16) + return SDValue(); + + if (!ShuffleVectorInst::isReverseMask(Mask, Mask.size())) + return SDValue(); + + int WidenNumElts = VT.getVectorNumElements() / 4; + SmallVector<int, 16> WidenMask(WidenNumElts, -1); + for (int i = 0; i < WidenNumElts; ++i) + WidenMask[i] = WidenNumElts - 1 - i; + + MVT WidenVT = MVT::getVectorVT( + VT.getVectorElementType() == MVT::i8 ? MVT::i32 : MVT::i64, WidenNumElts); + SDValue NewV1 = DAG.getBitcast(WidenVT, V1); + SDValue WidenRev = DAG.getVectorShuffle(WidenVT, DL, NewV1, + DAG.getUNDEF(WidenVT), WidenMask); + + return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, + DAG.getBitcast(VT, WidenRev), + DAG.getConstant(27, DL, Subtarget.getGRLenVT())); +} + /// Lower VECTOR_SHUFFLE into VPACKEV (if possible). /// /// VPACKEV interleaves the even elements from each vector. @@ -2004,6 +2041,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget))) + return Result; // TODO: This comment may be enabled in the future to better match the // pattern for instruction selection. @@ -2622,6 +2662,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, return Result; if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget))) + return Result; // TODO: This comment may be enabled in the future to better match the // pattern for instruction selection. diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 598735f..c923f0e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1082,6 +1082,161 @@ let Predicates = [hasPTX<70>, hasSM<80>] in { "mbarrier.pending_count.b64", [(set i32:$res, (int_nvvm_mbarrier_pending_count i64:$state))]>; } + +class MBAR_UTIL<string op, string scope, + string space = "", string sem = "", + bit tl = 0, bit parity = 0> { + // The mbarrier instructions in PTX ISA are of the general form: + // mbarrier.op.semantics.scope.space.b64 arg1, arg2 ... + // where: + // op -> arrive, expect_tx, complete_tx, arrive.expect_tx etc. + // semantics -> acquire, release, relaxed (default depends on the op) + // scope -> cta or cluster (default is cta-scope) + // space -> shared::cta or shared::cluster (default is shared::cta) + // + // The 'semantics' and 'scope' go together. If one is specified, + // then the other _must_ be specified. 
For example: + // (A) mbarrier.arrive <args> (valid, release and cta are default) + // (B) mbarrier.arrive.release.cta <args> (valid, sem/scope mentioned explicitly) + // (C) mbarrier.arrive.release <args> (invalid, needs scope) + // (D) mbarrier.arrive.cta <args> (invalid, needs order) + // + // Wherever possible, we prefer form (A) to (B) since it is available + // from early PTX versions. In most cases, explicitly specifying the + // scope requires a later version of PTX. + string _scope_asm = !cond( + !eq(scope, "scope_cluster") : "cluster", + !eq(scope, "scope_cta") : !if(!empty(sem), "", "cta"), + true : scope); + string _space_asm = !cond( + !eq(space, "space_cta") : "shared", + !eq(space, "space_cluster") : "shared::cluster", + true : space); + + string _parity = !if(parity, "parity", ""); + string asm_str = StrJoin<".", ["mbarrier", op, _parity, + sem, _scope_asm, _space_asm, "b64"]>.ret; + + string _intr_suffix = StrJoin<"_", [!subst(".", "_", op), _parity, + !if(tl, "tl", ""), + sem, scope, space]>.ret; + string intr_name = "int_nvvm_mbarrier_" # _intr_suffix; + + // Predicate checks: + // These are used only for the "test_wait/try_wait" variants as they + // have evolved since sm80 and are complex. The predicates for the + // remaining instructions are straightforward and have already been + // applied directly. + Predicate _sm_pred = !cond(!or( + !eq(op, "try_wait"), + !eq(scope, "scope_cluster"), + !eq(sem, "relaxed")) : hasSM<90>, + true : hasSM<80>); + Predicate _ptx_pred = !cond( + !eq(sem, "relaxed") : hasPTX<86>, + !ne(_scope_asm, "") : hasPTX<80>, + !eq(op, "try_wait") : hasPTX<78>, + parity : hasPTX<71>, + true : hasPTX<70>); + list<Predicate> preds = [_ptx_pred, _sm_pred]; +} + +foreach op = ["expect_tx", "complete_tx"] in { + foreach scope = ["scope_cta", "scope_cluster"] in { + foreach space = ["space_cta", "space_cluster"] in { + defvar intr = !cast<Intrinsic>(MBAR_UTIL<op, scope, space>.intr_name); + defvar suffix = StrJoin<"_", [op, scope, space]>.ret; + def mbar_ # suffix : BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$tx_count), + MBAR_UTIL<op, scope, space, "relaxed">.asm_str, + [(intr addr:$addr, i32:$tx_count)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + } // space + } // scope +} // op + +multiclass MBAR_ARR_INTR<string op, string scope, string sem, + list<Predicate> pred = []> { + // When either of sem or scope is non-default, both have to + // be explicitly specified. So, explicitly state that + // sem is `release` when scope is `cluster`. 
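[Editor's sketch, not part of the patch: the name-assembly rule the MBAR_UTIL comments above describe can be restated in plain C++ — join the non-empty dotted fields in order, and drop the default "cta" scope unless an explicit ordering forces it to be printed. Parity and time-limit fields are omitted here for brevity; the authoritative logic is the TableGen above.]

#include <string>

// Illustrative reconstruction of MBAR_UTIL's asm_str rule.
std::string mbarrierName(std::string Op, std::string Sem, std::string Scope,
                         std::string Space) {
  if (Scope == "cta" && Sem.empty())
    Scope.clear(); // form (A): sem and scope both stay implicit
  std::string Name = "mbarrier";
  for (const std::string &Piece : {Op, Sem, Scope, Space, std::string("b64")})
    if (!Piece.empty())
      Name += "." + Piece;
  return Name;
}
// mbarrierName("arrive", "", "cta", "shared")
//   == "mbarrier.arrive.shared.b64"                          (form A)
// mbarrierName("arrive", "release", "cluster", "shared::cluster")
//   == "mbarrier.arrive.release.cluster.shared::cluster.b64" (form B)
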
+ defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")), + "release", sem); + + defvar asm_cta = MBAR_UTIL<op, scope, "space_cta", asm_sem>.asm_str; + defvar intr_cta = !cast<Intrinsic>(MBAR_UTIL<op, scope, + "space_cta", sem>.intr_name); + + defvar asm_cluster = MBAR_UTIL<op, scope, "space_cluster", asm_sem>.asm_str; + defvar intr_cluster = !cast<Intrinsic>(MBAR_UTIL<op, scope, + "space_cluster", sem>.intr_name); + + def _CTA : NVPTXInst<(outs B64:$state), + (ins ADDR:$addr, B32:$tx_count), + asm_cta # " $state, [$addr], $tx_count;", + [(set i64:$state, (intr_cta addr:$addr, i32:$tx_count))]>, + Requires<pred>; + def _CLUSTER : NVPTXInst<(outs), + (ins ADDR:$addr, B32:$tx_count), + asm_cluster # " _, [$addr], $tx_count;", + [(intr_cluster addr:$addr, i32:$tx_count)]>, + Requires<pred>; +} +foreach op = ["arrive", "arrive.expect_tx", + "arrive_drop", "arrive_drop.expect_tx"] in { + foreach scope = ["scope_cta", "scope_cluster"] in { + defvar suffix = !subst(".", "_", op) # scope; + defm mbar_ # suffix # _release : MBAR_ARR_INTR<op, scope, "", [hasPTX<80>, hasSM<90>]>; + defm mbar_ # suffix # _relaxed : MBAR_ARR_INTR<op, scope, "relaxed", [hasPTX<86>, hasSM<90>]>; + } // scope +} // op + +multiclass MBAR_WAIT_INTR<string op, string scope, string sem, bit time_limit> { + // When either of sem or scope is non-default, both have to + // be explicitly specified. So, explicitly state that the + // semantics is `acquire` when the scope is `cluster`. + defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")), + "acquire", sem); + + defvar asm_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit, 1>.asm_str; + defvar pred_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit, 1>.preds; + defvar intr_parity = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta", + sem, time_limit, 1>.intr_name); + + defvar asm_state = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit>.asm_str; + defvar pred_state = MBAR_UTIL<op, scope, "space_cta", asm_sem, + time_limit>.preds; + defvar intr_state = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta", + sem, time_limit>.intr_name); + + defvar ins_tl_dag = !if(time_limit, (ins B32:$tl), (ins)); + defvar tl_suffix = !if(time_limit, ", $tl;", ";"); + defvar intr_state_dag = !con((intr_state addr:$addr, i64:$state), + !if(time_limit, (intr_state i32:$tl), (intr_state))); + defvar intr_parity_dag = !con((intr_parity addr:$addr, i32:$phase), + !if(time_limit, (intr_parity i32:$tl), (intr_parity))); + + def _STATE : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B64:$state), ins_tl_dag), + asm_state # " $res, [$addr], $state" # tl_suffix, + [(set i1:$res, intr_state_dag)]>, + Requires<pred_state>; + def _PARITY : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B32:$phase), ins_tl_dag), + asm_parity # " $res, [$addr], $phase" # tl_suffix, + [(set i1:$res, intr_parity_dag)]>, + Requires<pred_parity>; +} +foreach op = ["test_wait", "try_wait"] in { + foreach scope = ["scope_cta", "scope_cluster"] in { + foreach time_limit = !if(!eq(op, "try_wait"), [true, false], [false]) in { + defvar suffix = StrJoin<"_", [op, scope, !if(time_limit, "tl", "")]>.ret; + defm mbar_ # suffix # "_acquire" : MBAR_WAIT_INTR<op, scope, "", time_limit>; + defm mbar_ # suffix # "_relaxed" : MBAR_WAIT_INTR<op, scope, "relaxed", time_limit>; + } // time_limit + } // scope +} // op + //----------------------------------- // Math Functions //----------------------------------- diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td 
b/llvm/lib/Target/PowerPC/P10InstrResources.td index 92af04a..4695a6f 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -825,8 +825,7 @@ def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read], def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read], (instrs SRADI_rec, - SRAWI_rec, - SRAWI8_rec + SRAWI8_rec, SRAWI_rec )>; // Single crack instructions @@ -834,8 +833,7 @@ def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read], def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read, P10F2_Read], (instrs SRAD_rec, - SRAW_rec, - SRAW8_rec + SRAW8_rec, SRAW_rec )>; // 2-way crack instructions @@ -883,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY], // 3 Cycles ALU operations, 1 input operands def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read], (instrs - ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, ADDItocL8, LI, LI8, + ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, LI, LI8, ADDIC, ADDIC8, ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8, ADDME, ADDME8, @@ -1864,7 +1862,7 @@ def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read, P10ST_Read] (instrs CP_PASTE8_rec, CP_PASTE_rec, SLBIEG, - TLBIE + TLBIE, TLBIE8P9, TLBIEP9 )>; // Single crack instructions @@ -1886,8 +1884,7 @@ def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read, P10ST_Read, def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY], (instrs ISYNC, - SYNCP10, - SYNC + SYNC, SYNCP10 )>; // Expand instructions diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index 801ae83..3f5f7d3 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -905,7 +905,7 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C], SLBIEG, STMW, STSWI, - TLBIE + TLBIE, TLBIEP9, TLBIE8P9 )>; // Vector Store Instruction diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 4ff2f8a..5d9ec4a 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -409,6 +409,7 @@ def HasP10Vector : Predicate<"Subtarget->hasP10Vector()">; def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">; def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">; def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">; +def IsNotISA3_0 : Predicate<"!Subtarget->isISA3_0()">; def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">; def IsISAFuture : Predicate<"Subtarget->isISAFuture()">; diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def index 6bb66bc..043c9e4 100644 --- a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def +++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def @@ -29,7 +29,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1, ADDIStocHA8, ADDIdtprelL32, ADDItlsldLADDR32, - ADDItocL8, + ADDItocL, ADDME, ADDME8, ADDME8O, @@ -209,7 +209,9 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1, SRADI, SRADI_32, SRAW, + SRAW8, SRAWI, + SRAWI8, SRD, SRD_rec, SRW, @@ -518,7 +520,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1, ADDIStocHA8, ADDIdtprelL32, ADDItlsldLADDR32, - ADDItocL8, + ADDItocL, ADDME, ADDME8, ADDME8O, @@ -747,7 +749,9 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1, SRADI, SRADI_32, SRAW, + SRAW8, SRAWI, + SRAWI8, SRD, SRD_rec, SRW, diff --git 
a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index fba1c66..1a77b00 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -850,6 +850,36 @@ class XForm_45<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } +class XForm_RSB5_UIMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, NoItinerary> { + + bits<5> RS; + bits<5> RB; + bits<2> RIC; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11} = 0; + let Inst{12...13} = RIC; + let Inst{14...15} = 0; + let Inst{16...20} = RB; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + +class XForm_RSB5_UIMM2_2UIMM1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, list<dag> pattern> + : XForm_RSB5_UIMM2<opcode, xo, OOL, IOL, asmstr, pattern> { + + bits<1> PRS; + bits<1> R; + + let Inst{14} = PRS; + let Inst{15} = R; +} + class X_FRT5_XO2_XO3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 1aefea1..b0bed71c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -11,6 +11,18 @@ // //===----------------------------------------------------------------------===// +class XForm_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + list<dag> pattern> : I<opcode, OOL, IOL, asmstr, NoItinerary> { + bits<5> RS; + + let Pattern = pattern; + + let Inst{6...10} = RS; + let Inst{11...20} = 0; + let Inst{21...30} = xo; + let Inst{31} = 0; +} + class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, list<dag> pattern> : I<opcode, OOL, IOL, asmstr, NoItinerary> { @@ -294,6 +306,24 @@ let Predicates = [IsISAFuture] in { defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus", "$RT, $L, $RA, $RB", []>; + def TLBSYNCIO + : XForm_RS5<31, 564, (outs), (ins g8rc:$RS), "tlbsyncio $RS", []>; + def PTESYNCIO + : XForm_RS5<31, 596, (outs), (ins g8rc:$RS), "ptesyncio $RS", []>; + def TLBIEP : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), + (ins gprc:$RB, gprc:$RS, u2imm:$RIC, + u1imm:$PRS, u1imm:$R), + "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + def TLBIEIO + : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC), + "tlbieio $RB, $RS, $RIC", []>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def TLBIEP8 + : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs), + (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, + u1imm:$PRS, u1imm:$R), + "tlbiep $RB, $RS, $RIC, $PRS, $R", []>; + } } let Predicates = [HasVSX, IsISAFuture] in { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 44d1a44..f399811 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4321,7 +4321,22 @@ def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB), "tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>; def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RST, gprc:$RB), - "tlbie $RB,$RST", IIC_SprTLBIE, []>; + "tlbie $RB, $RST", IIC_SprTLBIE, []>, + Requires<[IsNotISA3_0]>; + +let Predicates = [IsISA3_0] in { + def TLBIEP9 : XForm_RSB5_UIMM2_2UIMM1<31, 306, (outs), + (ins gprc:$RB, gprc:$RS, u2imm:$RIC, + u1imm:$PRS, u1imm:$R), + "tlbie $RB, $RS, $RIC, $PRS, $R", []>; + let 
Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def TLBIE8P9 + : XForm_RSB5_UIMM2_2UIMM1<31, 306, (outs), + (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC, + u1imm:$PRS, u1imm:$R), + "tlbie $RB, $RS, $RIC, $PRS, $R", []>; + } +} def TLBSX : XForm_tlb<914, (outs), (ins gprc:$RA, gprc:$RB), "tlbsx $RA, $RB", IIC_LdStLoad>, Requires<[IsBookE]>; @@ -4669,7 +4684,11 @@ def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>; } -def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>; +def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>, Requires<[IsNotISA3_0]>; +let Predicates = [IsISA3_0] in { + def : InstAlias<"tlbie $RB", (TLBIEP9 R0, gprc:$RB, 0, 0, 0)>; + def : InstAlias<"tlbie $RB, $RS", (TLBIEP9 gprc:$RB, gprc:$RS, 0, 0, 0)>; +} def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>, Requires<[IsPPC4xx]>; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 53633ea..8198173 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -92,6 +92,8 @@ private: void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, MachineIRBuilder &MIB) const; bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const; + bool selectIntrinsicWithSideEffects(MachineInstr &I, + MachineIRBuilder &MIB) const; ComplexRendererFns selectShiftMask(MachineOperand &Root, unsigned ShiftWidth) const; @@ -714,6 +716,88 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { return GenericOpc; } +bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( + MachineInstr &I, MachineIRBuilder &MIB) const { + // Find the intrinsic ID. + unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID(); + // Select the instruction. + switch (IntrinID) { + default: + return false; + case Intrinsic::riscv_vlm: + case Intrinsic::riscv_vle: + case Intrinsic::riscv_vle_mask: + case Intrinsic::riscv_vlse: + case Intrinsic::riscv_vlse_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vle_mask || + IntrinID == Intrinsic::riscv_vlse_mask; + bool IsStrided = IntrinID == Intrinsic::riscv_vlse || + IntrinID == Intrinsic::riscv_vlse_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. 
+ + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + SrcOps.push_back(Register(RISCV::NoRegister)); + } + + // Base Pointer + auto PtrReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PtrReg); + + // Stride + if (IsStrided) { + auto StrideReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StrideReg); + } + + // Mask + if (IsMasked) { + auto MaskReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(MaskReg); + } + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + const RISCV::VLEPseudo *P = + RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW, + static_cast<unsigned>(LMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } + } +} + bool RISCVInstructionSelector::select(MachineInstr &MI) { MachineIRBuilder MIB(MI); @@ -984,6 +1068,8 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { return constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI); } + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectIntrinsicWithSideEffects(MI, MIB); default: return false; } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index d76180c..ea41716 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -245,7 +245,10 @@ struct ExtendedBuiltin { enum InstFlags { // It is a half type - INST_PRINTER_WIDTH16 = 1 + INST_PRINTER_WIDTH16 = 1, + // It is a 64-bit type + INST_PRINTER_WIDTH64 = INST_PRINTER_WIDTH16 << 1, + }; } // namespace SPIRV diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 35a2ee1..62f5e47 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -167,6 +167,36 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address, MI, FirstVariableIndex, OS); printRemainingVariableOps(MI, FirstVariableIndex + 1, OS); break; + case SPIRV::OpSwitch: + if (MI->getFlags() & SPIRV::INST_PRINTER_WIDTH64) { + // In binary format 64-bit types are split into two 32-bit operands, + // but in text format combine these into a single 64-bit value as + // this is what tools such as spirv-as require. 
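[Editor's sketch, not part of the patch: the binary/text split described above is a plain 32-bit word split, low word first — addNumImm in SPIRVUtils.cpp emits the two words, and the printer fuses them back for textual SPIR-V. A round-trip check:]

#include <cassert>
#include <cstdint>

// Round-trip of the 64-bit OpSwitch literal handling: the low 32-bit word
// is emitted first, then the high word; the printer recombines them.
int main() {
  uint64_t Literal = 0x0123456789abcdefULL;
  uint32_t LowBits = Literal & 0xffffffff;          // first binary word
  uint32_t HighBits = (Literal >> 32) & 0xffffffff; // second binary word
  uint64_t Combined = (uint64_t(HighBits) << 32) | LowBits;
  assert(Combined == Literal);
}
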
+ const unsigned NumOps = MI->getNumOperands(); + for (unsigned OpIdx = NumFixedOps; OpIdx < NumOps;) { + if (OpIdx + 1 >= NumOps || !MI->getOperand(OpIdx).isImm() || + !MI->getOperand(OpIdx + 1).isImm()) { + llvm_unreachable("Unexpected OpSwitch operands"); + continue; + } + OS << ' '; + uint64_t LowBits = MI->getOperand(OpIdx).getImm(); + uint64_t HighBits = MI->getOperand(OpIdx + 1).getImm(); + uint64_t CombinedValue = (HighBits << 32) | LowBits; + OS << formatImm(CombinedValue); + OpIdx += 2; + + // Next should be the label + if (OpIdx < NumOps) { + OS << ' '; + printOperand(MI, OpIdx, OS); + OpIdx++; + } + } + } else { + printRemainingVariableOps(MI, NumFixedOps, OS); + } + break; case SPIRV::OpImageSampleImplicitLod: case SPIRV::OpImageSampleDrefImplicitLod: case SPIRV::OpImageSampleProjImplicitLod: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index 4de9d6a..4c5b81f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -62,7 +62,9 @@ public: namespace SPIRV { enum AsmComments { // It is a half type - ASM_PRINTER_WIDTH16 = MachineInstr::TAsmComments + ASM_PRINTER_WIDTH16 = MachineInstr::TAsmComments, + // It is a 64 bit type + ASM_PRINTER_WIDTH64 = ASM_PRINTER_WIDTH16 << 1, }; } // namespace SPIRV diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp index e39666c..9aa07b5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp @@ -25,6 +25,8 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI, // Propagate previously set flags if (MI->getAsmPrinterFlags() & SPIRV::ASM_PRINTER_WIDTH16) OutMI.setFlags(SPIRV::INST_PRINTER_WIDTH16); + if (MI->getAsmPrinterFlags() & SPIRV::ASM_PRINTER_WIDTH64) + OutMI.setFlags(SPIRV::INST_PRINTER_WIDTH64); const MachineFunction *MF = MI->getMF(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 4e2cc88..8f2fc01 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -105,6 +105,8 @@ void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) { uint32_t LowBits = FullImm & 0xffffffff; uint32_t HighBits = (FullImm >> 32) & 0xffffffff; MIB.addImm(LowBits).addImm(HighBits); + // Asm Printer needs this info to print 64-bit operands correctly + MIB.getInstr()->setAsmPrinterFlag(SPIRV::ASM_PRINTER_WIDTH64); return; } report_fatal_error("Unsupported constant bitwidth"); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index de28faf..3da720f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1714,7 +1714,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( } if (Constraint[1] == '@') { if (StringRef("{@cc}").compare(Constraint) == 0) - return std::make_pair(0u, &SystemZ::GR32BitRegClass); + return std::make_pair(SystemZ::CC, &SystemZ::CCRRegClass); } } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); @@ -1766,10 +1766,6 @@ SDValue SystemZTargetLowering::LowerAsmOutputForConstraint( OpInfo.ConstraintVT.getSizeInBits() < 8) report_fatal_error("Glue output operand is of invalid type"); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(SystemZ::CC); - if 
     Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32, Glue);
     Chain = Glue.getValue(1);
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 706ab2b..51b540a 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -14,7 +14,10 @@
 #ifndef LLVM_LIB_TARGET_X86_X86_H
 #define LLVM_LIB_TARGET_X86_X86_H
 
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
 
@@ -162,7 +165,17 @@ FunctionPass *createX86WinEHUnwindV2Pass();
 
 /// The pass transforms load/store <256 x i32> to AMX load/store intrinsics
 /// or split the data to two <128 x i32>.
-FunctionPass *createX86LowerAMXTypePass();
+class X86LowerAMXTypePass : public PassInfoMixin<X86LowerAMXTypePass> {
+private:
+  const TargetMachine *TM;
+
+public:
+  X86LowerAMXTypePass(const TargetMachine *TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXTypeLegacyPass();
 
 /// The pass transforms amx intrinsics to scalar operation if the function has
 /// optnone attribute or it is O0.
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index d979517..2c0443d 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -10,6 +10,7 @@
 /// TODO: Port CodeGen passes to new pass manager.
 //===----------------------------------------------------------------------===//
 
+#include "X86.h"
 #include "X86ISelDAGToDAG.h"
 #include "X86TargetMachine.h"
 
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 0ba71ad..8ffd454 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -46,12 +46,14 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -64,7 +66,7 @@
 using namespace llvm;
 using namespace PatternMatch;
 
-#define DEBUG_TYPE "lower-amx-type"
+#define DEBUG_TYPE "x86-lower-amx-type"
 
 static bool isAMXCast(Instruction *II) {
   return match(II,
@@ -137,7 +139,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
 
 class ShapeCalculator {
 private:
-  TargetMachine *TM = nullptr;
+  const TargetMachine *TM = nullptr;
 
   // In AMX intrinsics we let Shape = {Row, Col}, but the
   // RealCol = Col / ElementSize. We may use the RealCol
@@ -145,7 +147,7 @@ private:
   std::map<Value *, Value *> Col2Row, Row2Col;
 
 public:
-  ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {}
+  ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {}
   std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
   std::pair<Value *, Value *> getShape(PHINode *Phi);
   Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
@@ -1432,8 +1434,58 @@ bool X86LowerAMXCast::transformAllAMXCast() {
   return Change;
 }
 
+bool lowerAmxType(Function &F, const TargetMachine *TM,
+                  TargetLibraryInfo *TLI) {
+  // Performance optimization: most code doesn't use AMX, so return early if
+  // there are no instructions that produce AMX values. This is sufficient, as
+  // AMX arguments and constants are not allowed -- so any producer of an AMX
+  // value must be an instruction.
+  // TODO: find a cheaper way for this, without looking at all instructions.
+  if (!containsAMXCode(F))
+    return false;
+
+  bool C = false;
+  ShapeCalculator SC(TM);
+  X86LowerAMXCast LAC(F, &SC);
+  C |= LAC.combineAMXcast(TLI);
+  // There might be remaining AMXcasts after combineAMXcast, and they should
+  // be handled elegantly.
+  C |= LAC.transformAllAMXCast();
+
+  X86LowerAMXType LAT(F, &SC);
+  C |= LAT.visit();
+
+  // Prepare for fast register allocation at O0.
+  // TODO: It may be better to check the volatile model of AMX code, not just
+  // to check Attribute::OptimizeNone and CodeGenOptLevel::None.
+  if (TM->getOptLevel() == CodeGenOptLevel::None) {
+    // If the front end does not use O0 but the mid/back end uses O0 (e.g.
+    // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll"), we should make sure the
+    // AMX data is volatile; that is necessary for AMX fast register
+    // allocation.
+    if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
+      X86VolatileTileData VTD(F);
+      C = VTD.volatileTileData() || C;
+    }
+  }
+
+  return C;
+}
+
 } // anonymous namespace
 
+PreservedAnalyses X86LowerAMXTypePass::run(Function &F,
+                                           FunctionAnalysisManager &FAM) {
+  TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+  bool Changed = lowerAmxType(F, TM, &TLI);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
 namespace {
 
 class X86LowerAMXTypeLegacyPass : public FunctionPass {
@@ -1443,44 +1495,10 @@ public:
   X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {}
 
   bool runOnFunction(Function &F) override {
-    // Performance optimization: most code doesn't use AMX, so return early if
-    // there are no instructions that produce AMX values. This is sufficient, as
-    // AMX arguments and constants are not allowed -- so any producer of an AMX
-    // value must be an instruction.
-    // TODO: find a cheaper way for this, without looking at all instructions.
-    if (!containsAMXCode(F))
-      return false;
-
-    bool C = false;
     TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
     TargetLibraryInfo *TLI =
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
-    ShapeCalculator SC(TM);
-    X86LowerAMXCast LAC(F, &SC);
-    C |= LAC.combineAMXcast(TLI);
-    // There might be remaining AMXcast after combineAMXcast and they should be
-    // handled elegantly.
-    C |= LAC.transformAllAMXCast();
-
-    X86LowerAMXType LAT(F, &SC);
-    C |= LAT.visit();
-
-    // Prepare for fast register allocation at O0.
-    // Todo: May better check the volatile model of AMX code, not just
-    // by checking Attribute::OptimizeNone and CodeGenOptLevel::None.
-    if (TM->getOptLevel() == CodeGenOptLevel::None) {
-      // If Front End not use O0 but the Mid/Back end use O0, (e.g.
-      // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make
-      // sure the amx data is volatile, that is nessary for AMX fast
-      // register allocation.
-      if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
-        X86VolatileTileData VTD(F);
-        C = VTD.volatileTileData() || C;
-      }
-    }
-
-    return C;
+    return lowerAmxType(F, TM, TLI);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1501,6 +1519,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
                     false)
 
-FunctionPass *llvm::createX86LowerAMXTypePass() {
+FunctionPass *llvm::createX86LowerAMXTypeLegacyPass() {
   return new X86LowerAMXTypeLegacyPass();
 }
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index 3f2a433..fc25d55 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -12,11 +12,16 @@
 
 // NOTE: NO INCLUDE GUARD DESIRED!
 
+#ifndef FUNCTION_PASS
+#define FUNCTION_PASS(NAME, CREATE_PASS)
+#endif
+FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+#undef FUNCTION_PASS
+
 #ifndef DUMMY_FUNCTION_PASS
 #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this))
 DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
 DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
 #undef DUMMY_FUNCTION_PASS
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d..9a76abc 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -423,7 +423,7 @@ void X86PassConfig::addIRPasses() {
   // We add both pass anyway and when these two passes run, we skip the pass
   // based on the option level and option attribute.
   addPass(createX86LowerAMXIntrinsicsPass());
-  addPass(createX86LowerAMXTypePass());
+  addPass(createX86LowerAMXTypeLegacyPass());
 
   TargetPassConfig::addIRPasses();
 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 6065575..c8d1938 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -369,6 +369,7 @@ getHostCPUNameForARMFromComponents(StringRef Implementer, StringRef Hardware,
   if (Implementer == "0x63") { // Arm China.
     return StringSwitch<const char *>(Part)
         .Case("0x132", "star-mc1")
+        .Case("0xd25", "star-mc3")
         .Default("generic");
   }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 9b9fe26..f939e7a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1525,7 +1525,15 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
   }
 
   // Try to extend the entire expression tree to the wide destination type.
-  if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
+  bool ShouldExtendExpression = true;
+  Value *TruncSrc = nullptr;
+  // It is not desirable to extend the expression in the trunc + sext pattern
+  // when the destination type is narrower than the original (pre-trunc) type.
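[Editor's note: a standalone C++ illustration of the fold this hunk builds toward — 'sext (trunc nsw X)' can become a direct cast of X, because nsw on the trunc guarantees the signed value survived the narrowing. Not part of the patch; values are illustrative.]

#include <cassert>
#include <cstdint>

int main() {
  int64_t X = -42;                             // fits in 32 bits, so the
  int32_t Narrow = static_cast<int32_t>(X);    // truncation is effectively nsw
  int64_t Sext = static_cast<int64_t>(Narrow); // sext of the trunc
  assert(Sext == X); // same result as casting X directly to the wide type
  return 0;
}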
+  if (match(Src, m_Trunc(m_Value(TruncSrc))))
+    if (TruncSrc->getType()->getScalarSizeInBits() > DestBitSize)
+      ShouldExtendExpression = false;
+  if (ShouldExtendExpression && shouldChangeType(SrcTy, DestTy) &&
+      canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this! Insert the new expression now.
     LLVM_DEBUG(
         dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1545,13 +1553,18 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &Sext) {
                                       ShAmt);
   }
 
-  Value *X;
-  if (match(Src, m_Trunc(m_Value(X)))) {
+  Value *X = TruncSrc;
+  if (X) {
     // If the input has more sign bits than bits truncated, then convert
     // directly to final type.
     unsigned XBitSize = X->getType()->getScalarSizeInBits();
-    if (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)
-      return CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+    bool HasNSW = cast<TruncInst>(Src)->hasNoSignedWrap();
+    if (HasNSW || (ComputeNumSignBits(X, &Sext) > XBitSize - SrcBitSize)) {
+      auto *Res = CastInst::CreateIntegerCast(X, DestTy, /* isSigned */ true);
+      if (auto *ResTrunc = dyn_cast<TruncInst>(Res); ResTrunc && HasNSW)
+        ResTrunc->setHasNoSignedWrap(true);
+      return Res;
+    }
 
     // If input is a trunc from the destination type, then convert into shifts.
     if (Src->hasOneUse() && X->getType() == DestTy) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 651e305..550dfc5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -105,6 +105,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
   if (~KnownShrBits.Zero != ShlAmt)
     return nullptr;
 
+  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+  IC.Builder.SetInsertPoint(I);
   Value *ShrAmtZ =
       IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
                               ShrAmt->getName() + ".z");
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9c8de45..67f837c 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3358,21 +3358,21 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     if (TyAllocSize == 1) {
       // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X)) to (bitcast Y),
-      // but only if the result pointer is only used as if it were an integer,
-      // or both point to the same underlying object (otherwise provenance is
-      // not necessarily retained).
+      // but only if the result pointer is only used as if it were an integer.
+      // (The case where the underlying object is the same is handled by
+      // InstSimplify.)
       Value *X = GEP.getPointerOperand();
       Value *Y;
-      if (match(GEP.getOperand(1),
-                m_Sub(m_PtrToInt(m_Value(Y)), m_PtrToInt(m_Specific(X)))) &&
+      if (match(GEP.getOperand(1), m_Sub(m_PtrToIntOrAddr(m_Value(Y)),
+                                         m_PtrToIntOrAddr(m_Specific(X)))) &&
           GEPType == Y->getType()) {
-        bool HasSameUnderlyingObject =
-            getUnderlyingObject(X) == getUnderlyingObject(Y);
+        bool HasNonAddressBits =
+            DL.getAddressSizeInBits(AS) != DL.getPointerSizeInBits(AS);
         bool Changed = false;
         GEP.replaceUsesWithIf(Y, [&](Use &U) {
-          bool ShouldReplace = HasSameUnderlyingObject ||
-                               isa<ICmpInst>(U.getUser()) ||
-                               isa<PtrToIntInst>(U.getUser());
+          bool ShouldReplace = isa<PtrToAddrInst>(U.getUser()) ||
+                               (!HasNonAddressBits &&
+                                isa<ICmpInst, PtrToIntInst>(U.getUser()));
           Changed |= ShouldReplace;
           return ShouldReplace;
         });
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index cb6ca72..7c364f8 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1539,7 +1539,7 @@ void AddressSanitizer::getInterestingMemoryOperands(
         IID == Intrinsic::experimental_vp_strided_load) {
       Stride = VPI->getOperand(PtrOpNo + 1);
       // Use the pointer alignment as the element alignment if the stride is a
-      // mutiple of the pointer alignment. Otherwise, the element alignment
+      // multiple of the pointer alignment. Otherwise, the element alignment
       // should be Align(1).
       unsigned PointerAlign = Alignment.valueOrOne().value();
       if (!isa<ConstantInt>(Stride) ||
@@ -2399,7 +2399,7 @@ void ModuleAddressSanitizer::instrumentGlobalsELF(
   // Putting globals in a comdat changes the semantic and potentially cause
   // false negative odr violations at link time. If odr indicators are used, we
-  // keep the comdat sections, as link time odr violations will be dectected on
+  // keep the comdat sections, as link time odr violations will be detected on
   // the odr indicator symbols.
   bool UseComdatForGlobalsGC = UseOdrIndicator && !UniqueModuleId.empty();
@@ -3858,7 +3858,7 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
     I->eraseFromParent();
   }
 
-  // Replace all uses of AddessReturnedByAlloca with NewAddressPtr.
+  // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
   AI->replaceAllUsesWith(NewAddressPtr);
 
   // We are done. Erase old alloca from parent.
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 72e8e50..0688bc7 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -359,7 +359,7 @@ class CHR {
     unsigned Count = 0;
     // Find out how many times region R is cloned. Note that if the parent
     // of R is cloned, R is also cloned, but R's clone count is not updated
-    // from the clone of the parent. We need to accumlate all the counts
+    // from the clone of the parent. We need to accumulate all the counts
    // from the ancestors to get the clone count.
     while (R) {
       Count += DuplicationCount[R];
@@ -1513,7 +1513,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
       BI->swapSuccessors();
       // Don't need to swap this in terms of
       // TrueBiasedRegions/FalseBiasedRegions because true-based/false-based
-      // mean whehter the branch is likely go into the if-then rather than
+      // mean whether the branch is likely go into the if-then rather than
       // successor0/successor1 and because we can tell which edge is the then or
       // the else one by comparing the destination to the region exit block.
       continue;
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index cf87e35..1e5946a 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -83,7 +83,7 @@ static cl::opt<unsigned>
 // ICP the candidate function even when only a declaration is present.
 static cl::opt<bool> ICPAllowDecls(
     "icp-allow-decls", cl::init(false), cl::Hidden,
-    cl::desc("Promote the target candidate even when the defintion "
+    cl::desc("Promote the target candidate even when the definition "
             " is not available"));
 
 // ICP hot candidate functions only. When setting to false, non-cold functions
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 5e7548b..7795cce 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -139,7 +139,7 @@ cl::opt<bool> ConditionalCounterUpdate(
     cl::init(false));
 
 // If the option is not specified, the default behavior about whether
-// counter promotion is done depends on how instrumentaiton lowering
+// counter promotion is done depends on how instrumentation lowering
 // pipeline is setup, i.e., the default value of true of this option
 // does not mean the promotion will be done by default. Explicitly
 // setting this option can override the default behavior.
@@ -1052,7 +1052,7 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
   GlobalVariable *Name = Ind->getName();
   auto It = ProfileDataMap.find(Name);
   assert(It != ProfileDataMap.end() && It->second.DataVar &&
-         "value profiling detected in function with no counter incerement");
+         "value profiling detected in function with no counter increment");
   GlobalVariable *DataVar = It->second.DataVar;
   uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
index 3c0f185..05616d8 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfInstrumentation.cpp
@@ -490,7 +490,7 @@ void createProfileFileNameVar(Module &M) {
   }
 }
 
-// Set MemprofHistogramFlag as a Global veriable in IR. This makes it accessible
+// Set MemprofHistogramFlag as a Global variable in IR. This makes it accessible
 // to the runtime, changing shadow count behavior.
 void createMemprofHistogramFlagVar(Module &M) {
   const StringRef VarName(MemProfHistogramFlagVar);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 10b03bb..471c6ec 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3136,7 +3136,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// If we don't instrument it and it gets inlined,
   /// our interceptor will not kick in and we will lose the memmove.
   /// If we instrument the call here, but it does not get inlined,
-  /// we will memove the shadow twice: which is bad in case
+  /// we will memmove the shadow twice: which is bad in case
   /// of overlapping regions. So, we simply lower the intrinsic to a call.
   ///
   /// Similar situation exists for memcpy and memset.
@@ -4775,7 +4775,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   // _mm_round_ps / _mm_round_ps.
   // Similar to maybeHandleSimpleNomemIntrinsic except
-  // the second argument is guranteed to be a constant integer.
+  // the second argument is guaranteed to be a constant integer.
   void handleRoundPdPsIntrinsic(IntrinsicInst &I) {
     assert(I.getArgOperand(0)->getType() == I.getType());
     assert(I.arg_size() == 2);
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
index f5b6686..5f87ed6 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfFlattening.cpp
@@ -176,7 +176,7 @@ PreservedAnalyses PGOCtxProfFlatteningPass::run(Module &M,
       assert(areAllBBsReachable(
                  F, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M)
                         .getManager()) &&
-             "Function has unreacheable basic blocks. The expectation was that "
+             "Function has unreachable basic blocks. The expectation was that "
             "DCE was run before.");
 
       auto It = FlattenedProfile.find(AssignGUIDPass::getGUID(F));
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index 0a358d4..de7c169 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -253,7 +253,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
   Value *RealContext = nullptr;
 
   StructType *ThisContextType = nullptr;
-  Value *TheRootFuctionData = nullptr;
+  Value *TheRootFunctionData = nullptr;
   Value *ExpectedCalleeTLSAddr = nullptr;
   Value *CallsiteInfoTLSAddr = nullptr;
  const bool HasMusttail = [&F]() {
@@ -283,7 +283,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
     Guid = Builder.getInt64(
         AssignGUIDPass::getGUID(cast<Function>(*Mark->getNameValue())));
     // The type of the context of this function is now knowable since we have
-    // NumCallsites and NumCounters. We delcare it here because it's more
+    // NumCallsites and NumCounters. We declare it here because it's more
     // convenient - we have the Builder.
     ThisContextType = StructType::get(
         F.getContext(),
@@ -291,28 +291,27 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
         ArrayType::get(Builder.getPtrTy(), NumCallsites)});
     // Figure out which way we obtain the context object for this function
-    // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the
-    // former case, we also set TheRootFuctionData since we need to release it
-    // at the end (plus it can be used to know if we have an entrypoint or a
-    // regular function)
-    // Don't set a name, they end up taking a lot of space and we don't need
-    // them.
+    // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the
+    // former case, we also set TheRootFunctionData since we need to release
+    // it at the end (plus it can be used to know if we have an entrypoint or
+    // a regular function). Don't set a name, they end up taking a lot of
+    // space and we don't need them.
     // Zero-initialize the FunctionData, except for functions that have
     // musttail calls. There, we set the CtxRoot field to 1, which will be
     // treated as a "can't be set as root".
-    TheRootFuctionData = new GlobalVariable(
+    TheRootFunctionData = new GlobalVariable(
         M, FunctionDataTy, false, GlobalVariable::InternalLinkage,
         HasMusttail ? CannotBeRootInitializer
                     : Constant::getNullValue(FunctionDataTy));
     if (ContextRootSet.contains(&F)) {
       Context = Builder.CreateCall(
-          StartCtx, {TheRootFuctionData, Guid, Builder.getInt32(NumCounters),
+          StartCtx, {TheRootFunctionData, Guid, Builder.getInt32(NumCounters),
                      Builder.getInt32(NumCallsites)});
       ORE.emit(
           [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
     } else {
-      Context = Builder.CreateCall(GetCtx, {TheRootFuctionData, &F, Guid,
+      Context = Builder.CreateCall(GetCtx, {TheRootFunctionData, &F, Guid,
                                             Builder.getInt32(NumCounters),
                                             Builder.getInt32(NumCallsites)});
       ORE.emit([&] {
@@ -399,7 +398,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
     } else if (!HasMusttail && isa<ReturnInst>(I)) {
       // Remember to release the context if we are an entrypoint.
       IRBuilder<> Builder(&I);
-      Builder.CreateCall(ReleaseCtx, {TheRootFuctionData});
+      Builder.CreateCall(ReleaseCtx, {TheRootFunctionData});
       ContextWasReleased = true;
     }
   }
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 120c4f6..71736cf 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1957,7 +1957,7 @@ static bool InstrumentAllFunctions(
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
     function_ref<LoopInfo *(Function &)> LookupLI,
     PGOInstrumentationType InstrumentationType) {
-  // For the context-sensitve instrumentation, we should have a separated pass
+  // For the context-sensitive instrumentation, we should have a separated pass
   // (before LTO/ThinLTO linking) to create these variables.
   if (InstrumentationType == PGOInstrumentationType::FDO)
     createIRLevelProfileFlagVar(M, InstrumentationType);
@@ -2248,7 +2248,7 @@ static bool annotateAllFunctions(
       Func.populateCoverage();
       continue;
     }
-    // When PseudoKind is set to a vaule other than InstrProfRecord::NotPseudo,
+    // When PseudoKind is set to a value other than InstrProfRecord::NotPseudo,
     // it means the profile for the function is unrepresentative and this
     // function is actually hot / warm. We will reset the function hot / cold
     // attribute and drop all the profile counters.
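[Editor's note: the PGOCtxProfLowering hunk above emits a start/get/release protocol — entry points create a root context and must release it on every return path, while other functions only fetch one. A simplified standalone C++ sketch with hypothetical names; the real runtime calls also take GUIDs and counter/callsite counts.]

#include <cassert>

struct FunctionData { int Active = 0; };

FunctionData *startCtx(FunctionData &Root) { // entrypoint prologue
  ++Root.Active;
  return &Root;
}
FunctionData *getCtx(FunctionData &Root) {   // non-entrypoint prologue
  return &Root;
}
void releaseCtx(FunctionData &Root) {        // entrypoint return paths
  --Root.Active;
}

int main() {
  FunctionData Root;
  FunctionData *Ctx = startCtx(Root); // entrypoint acquires the root context
  assert(Ctx == getCtx(Root));        // callees just fetch the same context
  releaseCtx(Root);                   // paired release before returning
  assert(Root.Active == 0);
  return 0;
}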
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
index 4801ac7..210b126 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
@@ -481,15 +481,18 @@ StringRef SanitizerBinaryMetadata::getSectionEnd(StringRef SectionSuffix) {
 } // namespace
 
 SanitizerBinaryMetadataPass::SanitizerBinaryMetadataPass(
-    SanitizerBinaryMetadataOptions Opts, ArrayRef<std::string> IgnorelistFiles)
-    : Options(std::move(Opts)), IgnorelistFiles(std::move(IgnorelistFiles)) {}
+    SanitizerBinaryMetadataOptions Opts,
+    IntrusiveRefCntPtr<vfs::FileSystem> VFS,
+    ArrayRef<std::string> IgnorelistFiles)
+    : Options(std::move(Opts)),
+      VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem()),
+      IgnorelistFiles(std::move(IgnorelistFiles)) {}
 
 PreservedAnalyses SanitizerBinaryMetadataPass::run(Module &M,
                                                    AnalysisManager<Module> &AM) {
   std::unique_ptr<SpecialCaseList> Ignorelist;
   if (!IgnorelistFiles.empty()) {
-    Ignorelist = SpecialCaseList::createOrDie(IgnorelistFiles,
-                                              *vfs::getRealFileSystem());
+    Ignorelist = SpecialCaseList::createOrDie(IgnorelistFiles, *VFS);
     if (Ignorelist->inSection("metadata", "src", M.getSourceFileName()))
       return PreservedAnalyses::all();
   }
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index b74a070..09abf6a 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -318,6 +318,18 @@ private:
 };
 } // namespace
 
+SanitizerCoveragePass::SanitizerCoveragePass(
+    SanitizerCoverageOptions Options, IntrusiveRefCntPtr<vfs::FileSystem> VFS,
+    const std::vector<std::string> &AllowlistFiles,
+    const std::vector<std::string> &BlocklistFiles)
+    : Options(std::move(Options)),
+      VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem()) {
+  if (AllowlistFiles.size() > 0)
+    Allowlist = SpecialCaseList::createOrDie(AllowlistFiles, *this->VFS);
+  if (BlocklistFiles.size() > 0)
+    Blocklist = SpecialCaseList::createOrDie(BlocklistFiles, *this->VFS);
+}
+
 PreservedAnalyses SanitizerCoveragePass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
   auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 0d48a35..fd0e9f1 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -355,7 +355,7 @@ static bool isVtableAccess(Instruction *I) {
 }
 
 // Do not instrument known races/"benign races" that come from compiler
-// instrumentatin. The user has no way of suppressing them.
+// instrumentation. The user has no way of suppressing them.
 static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
   // Peel off GEPs and BitCasts.
   Addr = Addr->stripInBoundsOffsets();
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 9471ae3..78d4a57e 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -643,7 +643,7 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
   // doesn't match, then we call the runtime (which may yet determine that
   // the mismatch is okay).
   //
-  // The checks generated below have the following strucutre.
+  // The checks generated below have the following structure.
   //
   // ; First we load the descriptor for the load from shadow memory and
   // ; compare it against the type descriptor for the current access type.
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 371d9e6..a9ab3b3 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -819,7 +819,7 @@ public:
       OS << "]";
     } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
       OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
-         << " Switch:" << *PS->Switch << " Edge: [";
+         << " Edge: [";
       PS->From->printAsOperand(OS);
       OS << ",";
       PS->To->printAsOperand(OS);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index d831c27..c537be5c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7551,6 +7551,7 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
 /// log2(C)-indexed value table (instead of traditionally emitting a load of the
 /// address of the jump target, and indirectly jump to it).
 static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
+                                        DomTreeUpdater *DTU,
                                         const DataLayout &DL,
                                         const TargetTransformInfo &TTI) {
   Value *Condition = SI->getCondition();
@@ -7573,12 +7574,6 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
   if (SI->getNumCases() < 4)
     return false;
 
-  // We perform this optimization only for switches with
-  // unreachable default case.
-  // This assumtion will save us from checking if `Condition` is a power of two.
-  if (!SI->defaultDestUnreachable())
-    return false;
-
   // Check that switch cases are powers of two.
   SmallVector<uint64_t, 4> Values;
   for (const auto &Case : SI->cases()) {
@@ -7598,6 +7593,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
 
   Builder.SetInsertPoint(SI);
 
+  if (!SI->defaultDestUnreachable()) {
+    // Let non-power-of-two inputs jump to the default case, when the latter is
+    // reachable.
+    auto *PopC = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, Condition);
+    auto *IsPow2 = Builder.CreateICmpEQ(PopC, ConstantInt::get(CondTy, 1));
+
+    auto *OrigBB = SI->getParent();
+    auto *DefaultCaseBB = SI->getDefaultDest();
+    BasicBlock *SplitBB = SplitBlock(OrigBB, SI, DTU);
+    auto It = OrigBB->getTerminator()->getIterator();
+    BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+    It->eraseFromParent();
+
+    addPredecessorToBlock(DefaultCaseBB, OrigBB, SplitBB);
+    if (DTU)
+      DTU->applyUpdates({{DominatorTree::Insert, OrigBB, DefaultCaseBB}});
+  }
+
   // Replace each case with its trailing zeros number.
   for (auto &Case : SI->cases()) {
     auto *OrigValue = Case.getCaseValue();
@@ -7953,7 +7966,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
                                  Options.ConvertSwitchToLookupTable))
     return requestResimplify();
 
-  if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
+  if (simplifySwitchOfPowersOfTwo(SI, Builder, DTU, DL, TTI))
    return requestResimplify();
 
   if (reduceSwitchRange(SI, Builder, DL, TTI))
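[Editor's note: the SimplifyCFG change above, restated as standalone C++ — each power-of-two case 1<<k is rekeyed to k via its trailing-zero count, and since the default may now be reachable, a popcount guard routes non-powers-of-two to it first. Case values here are illustrative.]

#include <bit>
#include <cstdint>

int classify(uint32_t C) {
  if (std::popcount(C) != 1)       // the new ctpop(C) == 1 guard:
    return -1;                     // non-powers-of-two take the default case
  switch (std::countr_zero(C)) {   // cases become their dense log2 keys
  case 0: return 10;               // was: case 1
  case 2: return 12;               // was: case 4
  case 4: return 14;               // was: case 16
  case 7: return 17;               // was: case 128
  default: return -1;
  }
}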

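[Editor's note: the SanitizerBinaryMetadataPass and SanitizerCoveragePass constructors above share one idiom — an injected, possibly-null virtual filesystem falls back to the real one, so existing callers are unaffected while tests can substitute their own. A generic standalone C++ sketch, not the LLVM API:]

#include <cassert>
#include <memory>
#include <string>

struct FileSystem {
  virtual ~FileSystem() = default;
  virtual std::string name() const = 0;
};
struct RealFS : FileSystem {
  std::string name() const override { return "real"; }
};

std::shared_ptr<FileSystem> getRealFileSystem() {
  static auto FS = std::make_shared<RealFS>();
  return FS;
}

// Mirrors 'VFS(VFS ? std::move(VFS) : vfs::getRealFileSystem())' above.
struct Pass {
  std::shared_ptr<FileSystem> VFS;
  explicit Pass(std::shared_ptr<FileSystem> FS = nullptr)
      : VFS(FS ? std::move(FS) : getRealFileSystem()) {}
};

int main() {
  Pass Default; // null injected FS -> falls back to the real filesystem
  assert(Default.VFS->name() == "real");
  return 0;
}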