Diffstat (limited to 'llvm/lib/Target')
23 files changed, 406 insertions, 102 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6965116..9926a4d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26196,9 +26196,10 @@ static SDValue performFlagSettingCombine(SDNode *N, return DCI.CombineTo(N, Res, SDValue(N, 1)); } - // Combine identical generic nodes into this node, re-using the result. + // Combine equivalent generic nodes into this node, re-using the result. if (SDNode *Generic = DCI.DAG.getNodeIfExists( - GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) + GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}, + /*AllowCommute=*/true)) DCI.CombineTo(Generic, SDValue(N, 0)); return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index f110558..7e03b97 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() { } bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes); - // Assume we can't combine the last pop with the sp restore. - bool CombineAfterCSRBump = false; + + unsigned ProloguePopSize = PrologueSaveSize; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack + // that needs to be popped until we reach the start of the SVE save area. + // The "FixedObject" stack occurs after the SVE area and must be popped + // later. + ProloguePopSize -= FixedObject; AfterCSRPopSize += FixedObject; - } else if (!CombineSPBump && PrologueSaveSize != 0) { + } + + // Assume we can't combine the last pop with the sp restore. + if (!CombineSPBump && ProloguePopSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION || - AArch64InstrInfo::isSEHInstruction(*Pop)) + AArch64InstrInfo::isSEHInstruction(*Pop) || + (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord && + isPartOfSVECalleeSaves(Pop))) Pop = std::prev(Pop); // Converting the last ldp to a post-index ldp is valid only if the last // ldp's offset is 0. @@ -1377,18 +1387,27 @@ void AArch64EpilogueEmitter::emitEpilogue() { // may clobber), convert it to a post-index ldp. if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) { convertCalleeSaveRestoreToSPPrePostIncDec( - Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy, - PrologueSaveSize); + Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy, + ProloguePopSize); + } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + MachineBasicBlock::iterator AfterLastPop = std::next(Pop); + if (AArch64InstrInfo::isSEHInstruction(*AfterLastPop)) + ++AfterLastPop; + // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate + // callee-save non-SVE registers to move the stack pointer to the start of + // the SVE area. + emitFrameOffset(MBB, AfterLastPop, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(ProloguePopSize), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); } else { - // If not, make sure to emit an add after the last ldp. + // Otherwise, make sure to emit an add after the last ldp. // We're doing this by transferring the size to be restored from the // adjustment *before* the CSR pops to the adjustment *after* the CSR // pops. 
- AfterCSRPopSize += PrologueSaveSize; - CombineAfterCSRBump = true; + AfterCSRPopSize += ProloguePopSize; } } - // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1419,6 +1438,17 @@ void AArch64EpilogueEmitter::emitEpilogue() { --SEHEpilogueStartI; } + // Determine the ranges of SVE callee-saves. This is done before emitting any + // code at the end of the epilogue (for Swift async), which can get in the way + // of finding SVE callee-saves with CalleeSavesAboveFrameRecord. + auto [PPR, ZPR] = getSVEStackFrameSizes(); + auto [PPRRange, ZPRRange] = partitionSVECS( + MBB, + SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord + ? MBB.getFirstTerminator() + : FirstGPRRestoreI, + PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); + if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); @@ -1441,14 +1471,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - auto [PPRRange, ZPRRange] = partitionSVECS( - MBB, - SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord - ? MBB.getFirstTerminator() - : FirstGPRRestoreI, - PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset SVEStackSize = SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; @@ -1467,16 +1489,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { NeedsWinCFI, &HasWinCFI); } - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - // Deallocate callee-save SVE registers. emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, @@ -1619,7 +1631,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, - StackOffset::getFixed(CombineAfterCSRBump ? 
PrologueSaveSize : 0)); + StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore)); } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 12ddf47..53b00e8 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -273,7 +273,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { EpilogueVectorizationMinVF = 8; MaxInterleaveFactor = 4; ScatterOverhead = 13; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NeoverseN2: case NeoverseN3: PrefFunctionAlignment = Align(16); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a44af5f..1b559a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, R = getMad(DAG, DL, VT, YH, CH, Mad1); } - const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && - (Flags.hasNoInfs() || Options.NoInfsFPMath); + const bool IsFiniteOnly = + (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs(); // TODO: Check if known finite from source value. if (!IsFiniteOnly) { @@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); - const auto &Options = getTargetMachine().Options; - if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { + if (!Flags.hasNoInfs()) { SDValue OverflowCheckConst = DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); SDValue Overflow = diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ee466ca..596a895 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, const bool IsFiniteOnly = (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && - (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); + MI.getFlag(MachineInstr::FmNoInfs); if (!IsFiniteOnly) { // Expand isfinite(x) => fabs(x) < inf @@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, R = B.buildSelect(Ty, Underflow, Zero, R); - const auto &Options = MF.getTarget().Options; - - if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { + if (!(Flags & MachineInstr::FmNoInfs)) { auto OverflowCheckConst = B.buildFConstant(Ty, IsExp10 ? 
0x1.344136p+5f : 0x1.62e430p+6f); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 71494be..4e11c4f 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -14,6 +14,7 @@ #include "GCNRegPressure.h" #include "AMDGPU.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" using namespace llvm; @@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind) { GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = Register::index2VirtReg(I); + if (RegKind != GCNRegPressure::TOTAL_KINDS && + GCNRegPressure::getRegKind(Reg, MRI) != RegKind) + continue; if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); @@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { #undef PFX } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI) { + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + auto &OS = dbgs(); + const char *RegName = GCNRegPressure::getName(Kind); + + unsigned MaxNumRegs = 0; + const MachineInstr *MaxPressureMI = nullptr; + GCNUpwardRPTracker RPT(LIS); + for (const MachineBasicBlock &MBB : MF) { + RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot()); + for (const MachineInstr &MI : reverse(MBB)) { + RPT.recede(MI); + unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind); + if (NumRegs > MaxNumRegs) { + MaxNumRegs = NumRegs; + MaxPressureMI = &MI; + } + } + } + + SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI); + + // Max pressure can occur at either the early-clobber or register slot. + // Choose the maximum liveset between both slots. This is ugly but this is + // diagnostic code. + SlotIndex ECSlot = MISlot.getRegSlot(true); + SlotIndex RSlot = MISlot.getRegSlot(false); + GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind); + GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind); + unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind); + unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind); + GCNRPTracker::LiveRegSet *LiveSet = + ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet; + SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot; + assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs); + + // Split live registers into single-def and multi-def sets. 
+ GCNRegPressure SDefPressure, MDefPressure; + SmallVector<Register, 16> SDefRegs, MDefRegs; + for (auto [Reg, LaneMask] : *LiveSet) { + assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind); + LiveInterval &LI = LIS.getInterval(Reg); + if (LI.getNumValNums() == 1 || + (LI.hasSubRanges() && + llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) { + return SR.getNumValNums() == 1; + }))) { + SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + SDefRegs.push_back(Reg); + } else { + MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + MDefRegs.push_back(Reg); + } + } + unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind); + unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind); + assert(SDefNumRegs + MDefNumRegs == MaxNumRegs); + + auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) { + return Printable([&, MBB, SI](raw_ostream &OS) { + OS << SI << ':' << printMBBReference(*MBB); + if (MLI) + if (const MachineLoop *ML = MLI->getLoopFor(MBB)) + OS << " (LoopHdr " << printMBBReference(*ML->getHeader()) + << ", Depth " << ML->getLoopDepth() << ")"; + }); + }; + + auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) { + GCNRegPressure RegPressure; + RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI); + OS << " " << printReg(Reg, TRI) << ':' + << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask " + << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' ' + << RegName << "s)\n"; + + // Use std::map to sort def/uses by SlotIndex. + std::map<SlotIndex, const MachineInstr *> Instrs; + for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) { + Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI; + } + + for (const auto &[SI, MI] : Instrs) { + OS << " "; + if (MI->definesRegister(Reg, TRI)) + OS << "def "; + if (MI->readsRegister(Reg, TRI)) + OS << "use "; + OS << printLoc(MI->getParent(), SI) << ": " << *MI; + } + }; + + OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName() + << " ***\n"; + OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at " + << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": " + << *MaxPressureMI; + + OS << "\nLive registers with single definition (" << SDefNumRegs << ' ' + << RegName << "s):\n"; + + // Sort SDefRegs by number of uses (smallest first) + llvm::sort(SDefRegs, [&](Register A, Register B) { + return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) < + std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end()); + }); + + for (const Register Reg : SDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } + + OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' ' + << RegName << "s):\n"; + for (const Register Reg : MDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } +} +#endif diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 898d1ff..979a8b0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -31,6 +31,12 @@ class SlotIndex; struct GCNRegPressure { enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; + static constexpr const char *getName(RegKind Kind) { + const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"}; + assert(Kind < TOTAL_KINDS); + return Names[Kind]; + } + GCNRegPressure() { clear(); } @@ -41,6 +47,11 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + unsigned getNumRegs(RegKind Kind) const { + assert(Kind < TOTAL_KINDS); + return 
Value[Kind]; + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure @@ -138,6 +149,12 @@ struct GCNRegPressure { void dump() const; + static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI); + return (RegKind)getRegKind(MRI.getRegClass(Reg), STI); + } + private: static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2; @@ -294,8 +311,10 @@ public: } }; -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); +GCNRPTracker::LiveRegSet +getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS); //////////////////////////////////////////////////////////////////////////////// // GCNUpwardRPTracker @@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, const MachineRegisterInfo &MRI, LaneBitmask LaneMaskFilter = LaneBitmask::getAll()); -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); - /// creates a map MachineInstr -> LiveRegSet /// R - range of iterators on instructions /// After - upon entry or exit of every instruction @@ -524,6 +540,11 @@ public: } }; +LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index bdc0810..58482ea 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +#define DUMP_MAX_REG_PRESSURE +static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( + "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure before scheduling."), + cl::init(false)); + +static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( + "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure after scheduling."), + cl::init(false)); +#endif + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() { RegionLiveOuts.buildLiveRegMap(); } +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageBeforeScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif + GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); while (S.advanceStage()) { auto Stage = createSchedStage(S.getCurrentStage()); @@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() { Stage->finalizeGCNSchedStage(); } + +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageAfterScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + 
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif } #ifndef NDEBUG diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5e27b37..6dcbced 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1019,7 +1019,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // SMEM and VMEM operations. So there will never be // outstanding address translations for both SMEM and // VMEM at the same time. - setScoreLB(T, CurrScore - 1); + setScoreLB(T, getScoreUB(T) - 1); PendingEvents &= ~(1 << OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index eac9fd4..27e5ee9c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3726,6 +3726,23 @@ def : GCNPat < } // End foreach Ty = ... } // End AddedComplexity = 1 +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat< + (i32 (DivergentBinFrag<or> + (i32 (zext i16:$src_lo)), + (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))) + )), + (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16) +>; +def : GCNPat< + (i32 (DivergentBinFrag<or> + (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))), + (i32 (zext i16:$src_lo)) + )), + (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16) +>; +} + let True16Predicate = UseRealTrue16Insts in def : GCNPat < (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))), diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index be1c883..ebd2e7e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2356,7 +2356,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); - LLVM_FALLTHROUGH; + [[fallthrough]]; } case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: @@ -2446,7 +2446,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); - LLVM_FALLTHROUGH; + [[fallthrough]]; } case AMDGPU::SI_SPILL_V16_RESTORE: case AMDGPU::SI_SPILL_V32_RESTORE: diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 7ddf996..f7deeaf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -632,7 +632,7 @@ SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op, case MVT::f32: { SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i32); if (Subtarget.is64Bit()) - NewVal = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, NewVal); + NewVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, NewVal); return DAG.getNode(Subtarget.is64Bit() ? 
LoongArchISD::MOVGR2FR_W_LA64 : LoongArchISD::MOVGR2FR_W, DL, VT, NewVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index bef4868..7e7ee75 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -280,6 +280,10 @@ static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) { } void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) { + if (!Subtarget->hasTcgen05InstSupport()) + report_fatal_error( + "tcgen05.ld is not supported on this architecture variant"); + SDLoc DL(N); unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); @@ -2136,6 +2140,10 @@ static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) { } void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) { + if (!Subtarget->hasTcgen05InstSupport()) + report_fatal_error( + "tcgen05.st is not supported on this architecture variant"); + SDLoc DL(N); unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a1fb665..272c21f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -233,7 +233,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; - LLVM_FALLTHROUGH; + [[fallthrough]]; case MVT::v2i8: case MVT::v2i64: case MVT::v2f64: @@ -248,7 +248,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // global and the target supports 256-bit loads/stores. if (!CanLowerTo256Bit) return std::nullopt; - LLVM_FALLTHROUGH; + [[fallthrough]]; case MVT::v2i16: // <1 x i16x2> case MVT::v2f16: // <1 x f16x2> case MVT::v2bf16: // <1 x bf16x2> @@ -270,7 +270,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; - LLVM_FALLTHROUGH; + [[fallthrough]]; case MVT::v2f32: // <1 x f32x2> case MVT::v4f32: // <2 x f32x2> case MVT::v2i32: // <1 x i32x2> @@ -6749,7 +6749,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::BinOp::Xchg: if (BitWidth == 128) return AtomicExpansionKind::None; - LLVM_FALLTHROUGH; + [[fallthrough]]; case AtomicRMWInst::BinOp::And: case AtomicRMWInst::BinOp::Or: case AtomicRMWInst::BinOp::Xor: diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 6c14cf0..dfde0cc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -101,6 +101,22 @@ def PrmtMode : Operand<i32> { // NVPTX Instruction Predicate Definitions //===----------------------------------------------------------------------===// +// Checks PTX version and family-specific and architecture-specific SM versions. +// For example, sm_100{f/a} and any future variants in the same family will match +// for any PTX version greater than or equal to `PTXVersion`. +class PTXWithFamilySMs<int PTXVersion, list<int> SMVersions> : + Predicate<"Subtarget->hasPTXWithFamilySMs(" # PTXVersion # ", {" # + !interleave(SMVersions, ", ") # "})">; + +// Checks PTX version and architecture-specific SM versions. +// For example, sm_100{a} will match for any PTX version +// greater than or equal to `PTXVersion`. 
+class PTXWithAccelSMs<int PTXVersion, list<int> SMVersions> : + Predicate<"Subtarget->hasPTXWithAccelSMs(" # PTXVersion # ", {" # + !interleave(SMVersions, ", ") # "})">; + +// Helper predicate to call a subtarget method. +class callSubtarget<string SubtargetMethod> : Predicate<"Subtarget->" # SubtargetMethod # "()">; def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a8b854f..22cf3a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -5103,8 +5103,8 @@ let Predicates = [hasSM<90>, hasPTX<78>] in { def EXIT : NullaryInst<"exit", int_nvvm_exit>; // Tcgen05 intrinsics -let isConvergent = true, Predicates = [hasTcgen05Instructions] in { - +let isConvergent = true in { +let Predicates = [callSubtarget<"hasTcgen05InstSupport">] in { multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins ADDR:$dst, B32:$ncols), @@ -5156,15 +5156,6 @@ defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">; defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">; defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">; -multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { - def "" : BasicNVPTXInst<(outs), - (ins ADDR:$tmem_addr), - "tcgen05.shift.cta_group::" # num # ".down", - [(Intr addr:$tmem_addr)]>; -} -defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; -defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; - multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> { defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16"); defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret; @@ -5195,9 +5186,22 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">; defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">; } +} // Predicates + +let Predicates = [callSubtarget<"hasTcgen05ShiftSupport">] in { +multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr), + "tcgen05.shift.cta_group::" # num # ".down", + [(Intr addr:$tmem_addr)]>; +} +defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; +defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; +} // Predicates + } // isConvergent -let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in { +let hasSideEffects = 1, Predicates = [callSubtarget<"hasTcgen05InstSupport">] in { def tcgen05_fence_before_thread_sync: NullaryInst< "tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>; @@ -5231,8 +5235,7 @@ class TCGEN05_LDST_REGINFO<int Veclen> { // class TCGEN05_LD_INST<string Shape, int Num, bit Pack> : - NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + NVPTXInst<(outs), (ins), "?", []> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>; @@ -5256,8 +5259,7 @@ class TCGEN05_LD_INST<string Shape, int Num, bit Pack> : // class TCGEN05_ST_INST<string Shape, int Num, bit Unpack> : - NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + NVPTXInst<(outs), (ins), "?", []> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< 
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index c548967..989be50 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -72,6 +72,40 @@ const SelectionDAGTargetInfo *NVPTXSubtarget::getSelectionDAGInfo() const { return TSInfo.get(); } +bool NVPTXSubtarget::hasPTXWithFamilySMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const { + unsigned PTXVer = getPTXVersion(); + if (!hasFamilySpecificFeatures() || PTXVer < PTXVersion) + return false; + + unsigned SMVer = getSmVersion(); + return llvm::any_of(SMVersions, [&](unsigned SM) { + // sm_101 is a different family, never group it with sm_10x. + if (SMVer == 101 || SM == 101) + return SMVer == SM && + // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not + // supported. + !(PTXVer >= 90 && SMVer == 101); + + return getSmFamilyVersion() == SM / 10 && SMVer >= SM; + }); +} + +bool NVPTXSubtarget::hasPTXWithAccelSMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const { + unsigned PTXVer = getPTXVersion(); + if (!hasArchAccelFeatures() || PTXVer < PTXVersion) + return false; + + unsigned SMVer = getSmVersion(); + return llvm::any_of(SMVersions, [&](unsigned SM) { + return SMVer == SM && + // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not + // supported. + !(PTXVer >= 90 && SMVer == 101); + }); +} + bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index e81c56b..194dbdc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -73,6 +73,18 @@ public: const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; + // Checks PTX version and family-specific and architecture-specific SM + // versions. For example, sm_100{f/a} and any future variants in the same + // family will match for any PTX version greater than or equal to + // `PTXVersion`. + bool hasPTXWithFamilySMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const; + // Checks PTX version and architecture-specific SM versions. + // For example, sm_100{a} will match for any PTX version greater than or equal + // to `PTXVersion`. + bool hasPTXWithAccelSMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const; + bool has256BitVectorLoadStore(unsigned AS) const { return SmVersion >= 100 && PTXVersion >= 88 && AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; @@ -127,6 +139,27 @@ public: return HasTcgen05 && PTXVersion >= MinPTXVersion; } + // Checks following instructions support: + // - tcgen05.ld/st + // - tcgen05.alloc/dealloc/relinquish + // - tcgen05.cp + // - tcgen05.fence/wait + // - tcgen05.commit + bool hasTcgen05InstSupport() const { + // sm_101 renamed to sm_110 in PTX 9.0 + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); + } + + // Checks tcgen05.shift instruction support. 
+ bool hasTcgen05ShiftSupport() const { + // sm_101 renamed to sm_110 in PTX 9.0 + return hasPTXWithAccelSMs(90, {100, 110, 103}) || + hasPTXWithAccelSMs(88, {100, 101, 103}) || + hasPTXWithAccelSMs(86, {100, 101}); + } + bool hasTcgen05MMAScaleInputDImm() const { return FullSmVersion == 1003 && PTXVersion >= 86; } @@ -158,6 +191,7 @@ public: bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } + unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; } // GPUs with "a" suffix have architecture-accelerated features that are // supported on the specified architecture only, hence such targets do not // follow the onion layer model. hasArchAccelFeatures() allows distinguishing diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 023fd14..bcb3f50 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2404,7 +2404,7 @@ void PPCAIXAsmPrinter::emitTracebackTable() { << static_cast<unsigned>(((V) & (TracebackTable::Field##Mask)) >> \ (TracebackTable::Field##Shift)) - GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsGlobaLinkage); + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsGlobalLinkage); GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsOutOfLineEpilogOrPrologue); EmitComment(); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 21dbb7c..4b54231 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1688,7 +1688,7 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, (1 << 25) - 1); // HACK: See comment before `BareSymbolQC_E_LI` in RISCVInstrInfoXqci.td. 
case Match_InvalidBareSymbolQC_E_LI: - LLVM_FALLTHROUGH; + [[fallthrough]]; // END HACK case Match_InvalidBareSImm32: return generateImmOutOfRangeError(Operands, ErrorInfo, diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 27fba34..100f1ec 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -1164,14 +1164,13 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, I.getOpcode() == TargetOpcode::G_USUBO) && "unexpected instruction"); - const Register DstReg = I.getOperand(0).getReg(); - const Register CarryOutReg = I.getOperand(1).getReg(); - const Register Op0Reg = I.getOperand(2).getReg(); - const Register Op1Reg = I.getOperand(3).getReg(); - bool IsSub = I.getOpcode() == TargetOpcode::G_USUBE || - I.getOpcode() == TargetOpcode::G_USUBO; - bool HasCarryIn = I.getOpcode() == TargetOpcode::G_UADDE || - I.getOpcode() == TargetOpcode::G_USUBE; + auto &CarryMI = cast<GAddSubCarryOut>(I); + + const Register DstReg = CarryMI.getDstReg(); + const Register CarryOutReg = CarryMI.getCarryOutReg(); + const Register Op0Reg = CarryMI.getLHSReg(); + const Register Op1Reg = CarryMI.getRHSReg(); + bool IsSub = CarryMI.isSub(); const LLT DstTy = MRI.getType(DstReg); assert(DstTy.isScalar() && "selectUAddSub only supported for scalar types"); @@ -1207,14 +1206,15 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, llvm_unreachable("selectUAddSub unsupported type."); } - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB); + const RegisterBank &CarryRB = *RBI.getRegBank(CarryOutReg, MRI, TRI); + const TargetRegisterClass *CarryRC = + getRegClass(MRI.getType(CarryOutReg), CarryRB); unsigned Opcode = IsSub ? OpSUB : OpADD; // G_UADDE/G_USUBE - find CarryIn def instruction. - if (HasCarryIn) { - Register CarryInReg = I.getOperand(4).getReg(); + if (auto CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) { + Register CarryInReg = CarryInMI->getCarryInReg(); MachineInstr *Def = MRI.getVRegDef(CarryInReg); while (Def->getOpcode() == TargetOpcode::G_TRUNC) { CarryInReg = Def->getOperand(1).getReg(); @@ -1227,11 +1227,12 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, Def->getOpcode() == TargetOpcode::G_USUBE || Def->getOpcode() == TargetOpcode::G_USUBO) { // carry set by prev ADD/SUB. - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), - X86::EFLAGS) - .addReg(CarryInReg); - if (!RBI.constrainGenericRegister(CarryInReg, *DstRC, MRI)) + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::CMP8ri)) + .addReg(CarryInReg) + .addImm(1); + + if (!RBI.constrainGenericRegister(CarryInReg, *CarryRC, MRI)) return false; Opcode = IsSub ? 
OpSBB : OpADC; @@ -1250,11 +1251,11 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, .addReg(Op0Reg) .addReg(Op1Reg); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg) - .addReg(X86::EFLAGS); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), CarryOutReg) + .addImm(X86::COND_B); if (!constrainSelectedInstRegOperands(Inst, TII, TRI, RBI) || - !RBI.constrainGenericRegister(CarryOutReg, *DstRC, MRI)) + !RBI.constrainGenericRegister(CarryOutReg, *CarryRC, MRI)) return false; I.eraseFromParent(); diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 11ef721..28fa2cd 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -194,11 +194,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .scalarize(0); getActionDefinitionsBuilder({G_UADDE, G_UADDO, G_USUBE, G_USUBO}) - .legalFor({{s8, s1}, {s16, s1}, {s32, s1}}) - .legalFor(Is64Bit, {{s64, s1}}) + .legalFor({{s8, s8}, {s16, s8}, {s32, s8}}) + .legalFor(Is64Bit, {{s64, s8}}) .widenScalarToNextPow2(0, /*Min=*/32) .clampScalar(0, s8, sMaxScalar) - .clampScalar(1, s1, s1) + .clampScalar(1, s8, s8) .scalarize(0); // integer multiply diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 080a9c0..4e73070 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -84,11 +84,11 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, case Xtensa::CCOMPARE0: if (FeatureBits[Xtensa::FeatureTimers1]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::CCOMPARE1: if (FeatureBits[Xtensa::FeatureTimers2]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::CCOMPARE2: if (FeatureBits[Xtensa::FeatureTimers3]) return true; @@ -107,37 +107,37 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, case Xtensa::EXCSAVE1: case Xtensa::EXCVADDR: return FeatureBits[Xtensa::FeatureException]; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC2: case Xtensa::EPS2: case Xtensa::EXCSAVE2: if (FeatureBits[Xtensa::FeatureHighPriInterrupts]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC3: case Xtensa::EPS3: case Xtensa::EXCSAVE3: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel3]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC4: case Xtensa::EPS4: case Xtensa::EXCSAVE4: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel4]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC5: case Xtensa::EPS5: case Xtensa::EXCSAVE5: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel5]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC6: case Xtensa::EPS6: case Xtensa::EXCSAVE6: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel6]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC7: case Xtensa::EPS7: case Xtensa::EXCSAVE7: |
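
Several hunks above (AArch64Subtarget, SIRegisterInfo, NVPTXISelLowering, RISCVAsmParser, and the Xtensa register checks) swap the LLVM_FALLTHROUGH macro for the standard C++17 [[fallthrough]] attribute in cascading feature-check switches. The following is a minimal standalone sketch of that pattern, not part of the patch; the enum and feature names (Timers1/2/3, CCompare0/1/2, checkRegister) are invented here purely for illustration.

// Minimal sketch (assumed names, not from the patch): cascading feature checks
// where each case intentionally falls through to the next, using the standard
// C++17 [[fallthrough]] attribute instead of the LLVM_FALLTHROUGH macro.
#include <cstdio>

enum Feature : unsigned { Timers1 = 1u << 0, Timers2 = 1u << 1, Timers3 = 1u << 2 };
enum Reg { CCompare0, CCompare1, CCompare2 };

static bool checkRegister(Reg R, unsigned FeatureBits) {
  switch (R) {
  case CCompare0:
    if (FeatureBits & Timers1)
      return true;
    [[fallthrough]]; // CCompare0 is also valid when two or three timers exist.
  case CCompare1:
    if (FeatureBits & Timers2)
      return true;
    [[fallthrough]]; // CCompare1 is also valid when three timers exist.
  case CCompare2:
    return (FeatureBits & Timers3) != 0;
  }
  return false;
}

int main() {
  // Accepted: three timers imply all three compare registers exist.
  std::printf("%d\n", checkRegister(CCompare0, Timers3)); // prints 1
  // Rejected: a single timer only provides CCompare0.
  std::printf("%d\n", checkRegister(CCompare2, Timers1)); // prints 0
  return 0;
}

Each case returns early when its own feature bit is set and otherwise falls through to the check for the next-higher configuration, so a register is accepted whenever an equal or larger feature set is present; the attribute documents that the fallthrough is deliberate, which is the shape the hunks above preserve while modernizing the annotation.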