diff options
Diffstat (limited to 'llvm/lib/Target')
51 files changed, 1060 insertions, 966 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4357264d..c76689f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -345,12 +345,6 @@ static unsigned getStackHazardSize(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize(); } -/// Returns true if PPRs are spilled as ZPRs. -static bool arePPRsSpilledAsZPR(const MachineFunction &MF) { - return MF.getSubtarget().getRegisterInfo()->getSpillSize( - AArch64::PPRRegClass) == 16; -} - StackOffset AArch64FrameLowering::getZPRStackSize(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -1966,8 +1960,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; break; case RegPairInfo::PPR: - StrOpc = - Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI; + StrOpc = AArch64::STR_PXI; break; case RegPairInfo::VG: StrOpc = AArch64::STRXui; @@ -2178,8 +2171,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; break; case RegPairInfo::PPR: - LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO - : AArch64::LDR_PXI; + LdrOpc = AArch64::LDR_PXI; break; case RegPairInfo::VG: continue; @@ -2286,9 +2278,7 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI, // Returns true if the LDST MachineInstr \p MI is a PPR access. static bool isPPRAccess(const MachineInstr &MI) { - return MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && - MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO && - AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()); + return AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()); } // Check if a Hazard slot is needed for the current function, and if so create @@ -2390,12 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot( return; } - if (arePPRsSpilledAsZPR(MF)) { - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with " - "-aarch64-enable-zpr-predicate-spills"); - return; - } - // If another calling convention is explicitly set FPRs can't be promoted to // ZPR callee-saves. if (!is_contained({CallingConv::C, CallingConv::Fast, @@ -2519,14 +2503,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, continue; } - // Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is - // spilled. If all of p0-p3 are used as return values p4 is must be free - // to reload p8-p15. - if (RegInfo->getSpillSize(AArch64::PPRRegClass) == 16 && - AArch64::PPR_p8to15RegClass.contains(Reg)) { - SavedRegs.set(AArch64::P4); - } - // MachO's compact unwind format relies on all registers being stored in // pairs. // FIXME: the usual format is actually better if unwinding isn't needed. @@ -2587,7 +2563,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, auto SpillSize = TRI->getSpillSize(*RC); bool IsZPR = AArch64::ZPRRegClass.contains(Reg); bool IsPPR = !IsZPR && AArch64::PPRRegClass.contains(Reg); - if (IsZPR || (IsPPR && arePPRsSpilledAsZPR(MF))) + if (IsZPR) ZPRCSStackSize += SpillSize; else if (IsPPR) PPRCSStackSize += SpillSize; @@ -2902,7 +2878,7 @@ static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, StackTop += MFI.getObjectSize(FI); StackTop = alignTo(StackTop, Alignment); - assert(StackTop < std::numeric_limits<int64_t>::max() && + assert(StackTop < (uint64_t)std::numeric_limits<int64_t>::max() && "SVE StackTop far too large?!"); int64_t Offset = -int64_t(StackTop); @@ -2961,314 +2937,8 @@ static SVEStackSizes determineSVEStackSizes(MachineFunction &MF, return SVEStack; } -/// Attempts to scavenge a register from \p ScavengeableRegs given the used -/// registers in \p UsedRegs. -static Register tryScavengeRegister(LiveRegUnits const &UsedRegs, - BitVector const &ScavengeableRegs, - Register PreferredReg) { - if (PreferredReg != AArch64::NoRegister && UsedRegs.available(PreferredReg)) - return PreferredReg; - for (auto Reg : ScavengeableRegs.set_bits()) { - if (UsedRegs.available(Reg)) - return Reg; - } - return AArch64::NoRegister; -} - -/// Propagates frame-setup/destroy flags from \p SourceMI to all instructions in -/// \p MachineInstrs. -static void propagateFrameFlags(MachineInstr &SourceMI, - ArrayRef<MachineInstr *> MachineInstrs) { - for (MachineInstr *MI : MachineInstrs) { - if (SourceMI.getFlag(MachineInstr::FrameSetup)) - MI->setFlag(MachineInstr::FrameSetup); - if (SourceMI.getFlag(MachineInstr::FrameDestroy)) - MI->setFlag(MachineInstr::FrameDestroy); - } -} - -/// RAII helper class for scavenging or spilling a register. On construction -/// attempts to find a free register of class \p RC (given \p UsedRegs and \p -/// AllocatableRegs), if no register can be found spills \p SpillCandidate to \p -/// MaybeSpillFI to free a register. The free'd register is returned via the \p -/// FreeReg output parameter. On destruction, if there is a spill, its previous -/// value is reloaded. The spilling and scavenging is only valid at the -/// insertion point \p MBBI, this class should _not_ be used in places that -/// create or manipulate basic blocks, moving the expected insertion point. -struct ScopedScavengeOrSpill { - ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete; - ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete; - - ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - Register SpillCandidate, const TargetRegisterClass &RC, - LiveRegUnits const &UsedRegs, - BitVector const &AllocatableRegs, - std::optional<int> *MaybeSpillFI, - Register PreferredReg = AArch64::NoRegister) - : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>( - *MF.getSubtarget().getInstrInfo())), - TRI(*MF.getSubtarget().getRegisterInfo()) { - FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs, PreferredReg); - if (FreeReg != AArch64::NoRegister) - return; - assert(MaybeSpillFI && "Expected emergency spill slot FI information " - "(attempted to spill in prologue/epilogue?)"); - if (!MaybeSpillFI->has_value()) { - MachineFrameInfo &MFI = MF.getFrameInfo(); - *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC), - TRI.getSpillAlign(RC)); - } - FreeReg = SpillCandidate; - SpillFI = MaybeSpillFI->value(); - TII.storeRegToStackSlot(MBB, MBBI, FreeReg, false, *SpillFI, &RC, &TRI, - Register()); - } - - bool hasSpilled() const { return SpillFI.has_value(); } - - /// Returns the free register (found from scavenging or spilling a register). - Register freeRegister() const { return FreeReg; } - - Register operator*() const { return freeRegister(); } - - ~ScopedScavengeOrSpill() { - if (hasSpilled()) - TII.loadRegFromStackSlot(MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI, - Register()); - } - -private: - MachineBasicBlock &MBB; - MachineBasicBlock::iterator MBBI; - const TargetRegisterClass &RC; - const AArch64InstrInfo &TII; - const TargetRegisterInfo &TRI; - Register FreeReg = AArch64::NoRegister; - std::optional<int> SpillFI; -}; - -/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and -/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO. -struct EmergencyStackSlots { - std::optional<int> ZPRSpillFI; - std::optional<int> PPRSpillFI; - std::optional<int> GPRSpillFI; -}; - -/// Registers available for scavenging (ZPR, PPR3b, GPR). -struct ScavengeableRegs { - BitVector ZPRRegs; - BitVector PPR3bRegs; - BitVector GPRRegs; -}; - -static bool isInPrologueOrEpilogue(const MachineInstr &MI) { - return MI.getFlag(MachineInstr::FrameSetup) || - MI.getFlag(MachineInstr::FrameDestroy); -} - -/// Expands: -/// ``` -/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 -/// ``` -/// To: -/// ``` -/// $z0 = CPY_ZPzI_B $p0, 1, 0 -/// STR_ZXI $z0, $stack.0, 0 -/// ``` -/// While ensuring a ZPR ($z0 in this example) is free for the predicate ( -/// spilling if necessary). -static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB, - MachineInstr &MI, - const TargetRegisterInfo &TRI, - LiveRegUnits const &UsedRegs, - ScavengeableRegs const &SR, - EmergencyStackSlots &SpillSlots) { - MachineFunction &MF = *MBB.getParent(); - auto *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - - ScopedScavengeOrSpill ZPredReg( - MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs, - isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI); - - SmallVector<MachineInstr *, 2> MachineInstrs; - const DebugLoc &DL = MI.getDebugLoc(); - MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B)) - .addReg(*ZPredReg, RegState::Define) - .add(MI.getOperand(0)) - .addImm(1) - .addImm(0) - .getInstr()); - MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI)) - .addReg(*ZPredReg) - .add(MI.getOperand(1)) - .addImm(MI.getOperand(2).getImm()) - .setMemRefs(MI.memoperands()) - .getInstr()); - propagateFrameFlags(MI, MachineInstrs); -} - -/// Expands: -/// ``` -/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 -/// ``` -/// To: -/// ``` -/// $z0 = LDR_ZXI %stack.0, 0 -/// $p0 = PTRUE_B 31, implicit $vg -/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv -/// ``` -/// While ensuring a ZPR ($z0 in this example) is free for the predicate ( -/// spilling if necessary). If the status flags are in use at the point of -/// expansion they are preserved (by moving them to/from a GPR). This may cause -/// an additional spill if no GPR is free at the expansion point. -static bool expandFillPPRFromZPRSlotPseudo( - MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI, - LiveRegUnits const &UsedRegs, ScavengeableRegs const &SR, - MachineInstr *&LastPTrue, EmergencyStackSlots &SpillSlots) { - MachineFunction &MF = *MBB.getParent(); - auto *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - - ScopedScavengeOrSpill ZPredReg( - MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs, - isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI); - - ScopedScavengeOrSpill PredReg( - MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs, - isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI, - /*PreferredReg=*/ - LastPTrue ? LastPTrue->getOperand(0).getReg() : AArch64::NoRegister); - - // Elide NZCV spills if we know it is not used. - bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV); - std::optional<ScopedScavengeOrSpill> NZCVSaveReg; - if (IsNZCVUsed) - NZCVSaveReg.emplace( - MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs, - isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI); - SmallVector<MachineInstr *, 4> MachineInstrs; - const DebugLoc &DL = MI.getDebugLoc(); - MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI)) - .addReg(*ZPredReg, RegState::Define) - .add(MI.getOperand(1)) - .addImm(MI.getOperand(2).getImm()) - .setMemRefs(MI.memoperands()) - .getInstr()); - if (IsNZCVUsed) - MachineInstrs.push_back( - BuildMI(MBB, MI, DL, TII->get(AArch64::MRS)) - .addReg(NZCVSaveReg->freeRegister(), RegState::Define) - .addImm(AArch64SysReg::NZCV) - .addReg(AArch64::NZCV, RegState::Implicit) - .getInstr()); - - // Reuse previous ptrue if we know it has not been clobbered. - if (LastPTrue) { - assert(*PredReg == LastPTrue->getOperand(0).getReg()); - LastPTrue->moveBefore(&MI); - } else { - LastPTrue = BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B)) - .addReg(*PredReg, RegState::Define) - .addImm(31); - } - MachineInstrs.push_back(LastPTrue); - MachineInstrs.push_back( - BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B)) - .addReg(MI.getOperand(0).getReg(), RegState::Define) - .addReg(*PredReg) - .addReg(*ZPredReg) - .addImm(0) - .addReg(AArch64::NZCV, RegState::ImplicitDefine) - .getInstr()); - if (IsNZCVUsed) - MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR)) - .addImm(AArch64SysReg::NZCV) - .addReg(NZCVSaveReg->freeRegister()) - .addReg(AArch64::NZCV, RegState::ImplicitDefine) - .getInstr()); - - propagateFrameFlags(MI, MachineInstrs); - return PredReg.hasSpilled(); -} - -/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO -/// operations within the MachineBasicBlock \p MBB. -static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB, - const TargetRegisterInfo &TRI, - ScavengeableRegs const &SR, - EmergencyStackSlots &SpillSlots) { - LiveRegUnits UsedRegs(TRI); - UsedRegs.addLiveOuts(MBB); - bool HasPPRSpills = false; - MachineInstr *LastPTrue = nullptr; - for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) { - UsedRegs.stepBackward(MI); - switch (MI.getOpcode()) { - case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO: - if (LastPTrue && - MI.definesRegister(LastPTrue->getOperand(0).getReg(), &TRI)) - LastPTrue = nullptr; - HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, - LastPTrue, SpillSlots); - MI.eraseFromParent(); - break; - case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO: - expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots); - MI.eraseFromParent(); - [[fallthrough]]; - default: - LastPTrue = nullptr; - break; - } - } - - return HasPPRSpills; -} - void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { - - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - const TargetSubtargetInfo &TSI = MF.getSubtarget(); - const TargetRegisterInfo &TRI = *TSI.getRegisterInfo(); - - // If predicates spills are 16-bytes we may need to expand - // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO. - if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) { - auto ComputeScavengeableRegisters = [&](unsigned RegClassID) { - BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID)); - assert(Regs.count() > 0 && "Expected scavengeable registers"); - return Regs; - }; - - ScavengeableRegs SR{}; - SR.ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID); - // Only p0-7 are possible as the second operand of cmpne (needed for fills). - SR.PPR3bRegs = ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID); - SR.GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID); - - EmergencyStackSlots SpillSlots; - for (MachineBasicBlock &MBB : MF) { - // In the case we had to spill a predicate (in the range p0-p7) to reload - // a predicate (>= p8), additional spill/fill pseudos will be created. - // These need an additional expansion pass. Note: There will only be at - // most two expansion passes, as spilling/filling a predicate in the range - // p0-p7 never requires spilling another predicate. - for (int Pass = 0; Pass < 2; Pass++) { - bool HasPPRSpills = - expandSMEPPRToZPRSpillPseudos(MBB, TRI, SR, SpillSlots); - assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills"); - if (!HasPPRSpills) - break; - } - } - } - - MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && "Upwards growing stack unsupported"); @@ -3279,6 +2949,9 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( if (!MF.hasEHFunclets()) return; + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *AFI = MF.getInfo<AArch64FunctionInfo>(); + // Win64 C++ EH needs to allocate space for the catch objects in the fixed // object area right next to the UnwindHelp object. WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); @@ -4280,18 +3953,10 @@ void AArch64FrameLowering::emitRemarks( } unsigned RegTy = StackAccess::AccessType::GPR; - if (MFI.hasScalableStackID(FrameIdx)) { - // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO - // spill/fill the predicate as a data vector (so are an FPR access). - if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO && - MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO && - AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) { - RegTy = StackAccess::PPR; - } else - RegTy = StackAccess::FPR; - } else if (AArch64InstrInfo::isFpOrNEON(MI)) { + if (MFI.hasScalableStackID(FrameIdx)) + RegTy = isPPRAccess(MI) ? StackAccess::PPR : StackAccess::FPR; + else if (AArch64InstrInfo::isFpOrNEON(MI)) RegTy = StackAccess::FPR; - } StackAccesses[ArrIdx].AccessTypes |= RegTy; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5a90da1..b8761d97 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2579,8 +2579,6 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STZ2Gi: case AArch64::STZGi: case AArch64::TAGPstack: - case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO: - case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO: return 2; case AArch64::LD1B_D_IMM: case AArch64::LD1B_H_IMM: @@ -4387,8 +4385,6 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MinOffset = -256; MaxOffset = 254; break; - case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO: - case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO: case AArch64::LDR_ZXI: case AArch64::STR_ZXI: Scale = TypeSize::getScalable(16); @@ -5098,33 +5094,31 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else if (Subtarget.hasZeroCycleRegMoveGPR64() && + !Subtarget.hasZeroCycleRegMoveGPR32()) { + // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. + MCRegister DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + assert(DestRegX.isValid() && "Destination super-reg not valid"); + MCRegister SrcRegX = + SrcReg == AArch64::WZR + ? AArch64::XZR + : TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + assert(SrcRegX.isValid() && "Source super-reg not valid"); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) + .addReg(AArch64::XZR) + .addReg(SrcRegX, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); } else { - if (Subtarget.hasZeroCycleRegMoveGPR64() && - !Subtarget.hasZeroCycleRegMoveGPR32()) { - // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. - MCRegister DestRegX = TRI->getMatchingSuperReg( - DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); - assert(DestRegX.isValid() && "Destination super-reg not valid"); - MCRegister SrcRegX = - SrcReg == AArch64::WZR - ? AArch64::XZR - : TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, - &AArch64::GPR64spRegClass); - assert(SrcRegX.isValid() && "Source super-reg not valid"); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) - .addReg(AArch64::XZR) - .addReg(SrcRegX, RegState::Undef) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - // Otherwise, expand to ORR WZR. - BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) - .addReg(AArch64::WZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } + // Otherwise, expand to ORR WZR. + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(SrcReg, getKillRegState(KillSrc)); } return; } @@ -5650,11 +5644,6 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZXI; StackID = TargetStackID::ScalableVector; - } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.isSVEorStreamingSVEAvailable() && - "Unexpected predicate store without SVE store instructions"); - Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO; - StackID = TargetStackID::ScalableVector; } break; case 24: @@ -5835,11 +5824,6 @@ void AArch64InstrInfo::loadRegFromStackSlot( "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZXI; StackID = TargetStackID::ScalableVector; - } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { - assert(Subtarget.isSVEorStreamingSVEAvailable() && - "Unexpected predicate load without SVE load instructions"); - Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO; - StackID = TargetStackID::ScalableVector; } break; case 24: diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index aed137c..1568161 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -57,10 +57,7 @@ static bool isPartOfZPRCalleeSaves(MachineBasicBlock::iterator I) { case AArch64::ST1B_2Z_IMM: case AArch64::STR_ZXI: case AArch64::LDR_ZXI: - case AArch64::CPY_ZPzI_B: - case AArch64::CMPNE_PPzZI_B: case AArch64::PTRUE_C_B: - case AArch64::PTRUE_B: return I->getFlag(MachineInstr::FrameSetup) || I->getFlag(MachineInstr::FrameDestroy); case AArch64::SEH_SavePReg: diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 5d89862..ef974df 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -980,19 +980,10 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size, //****************************************************************************** // SVE predicate register classes. - -// Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet() -// (without the use of the table-gen'd predicates). -def SMEWithZPRPredicateSpills : HwMode<[Predicate<"false">]>; - -def PPRSpillFillRI : RegInfoByHwMode< - [DefaultMode, SMEWithZPRPredicateSpills], - [RegInfo<16,16,16>, RegInfo<16,128,128>]>; - class PPRClass<int firstreg, int lastreg, int step = 1> : RegisterClass<"AArch64", [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, (sequence "P%u", firstreg, lastreg, step)> { - let RegInfos = PPRSpillFillRI; + let Size = 16; } def PPR : PPRClass<0, 15> { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 98e0a11..12ddf47 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -86,11 +86,6 @@ static cl::alias AArch64StreamingStackHazardSize( cl::desc("alias for -aarch64-streaming-hazard-size"), cl::aliasopt(AArch64StreamingHazardSize)); -static cl::opt<bool> EnableZPRPredicateSpills( - "aarch64-enable-zpr-predicate-spills", cl::init(false), cl::Hidden, - cl::desc( - "Enables spilling/reloading SVE predicates as data vectors (ZPRs)")); - static cl::opt<unsigned> VScaleForTuningOpt("sve-vscale-for-tuning", cl::Hidden, cl::desc("Force a vscale for tuning factor for SVE")); @@ -426,20 +421,6 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, EnableSubregLiveness = EnableSubregLivenessTracking.getValue(); } -unsigned AArch64Subtarget::getHwModeSet() const { - AArch64HwModeBits Modes = AArch64HwModeBits::DefaultMode; - - // Use a special hardware mode in streaming[-compatible] functions with - // aarch64-enable-zpr-predicate-spills. This changes the spill size (and - // alignment) for the predicate register class. - if (EnableZPRPredicateSpills.getValue() && - (isStreaming() || isStreamingCompatible())) { - Modes |= AArch64HwModeBits::SMEWithZPRPredicateSpills; - } - - return to_underlying(Modes); -} - const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 671df35..8974965 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -130,8 +130,6 @@ public: bool IsStreaming = false, bool IsStreamingCompatible = false, bool HasMinSize = false); - virtual unsigned getHwModeSet() const override; - // Getters for SubtargetFeatures defined in tablegen #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ bool GETTER() const { return ATTRIBUTE; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 50a8754..479e345 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5666,18 +5666,21 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( VectorType *AccumVectorType = VectorType::get(AccumType, VF.divideCoefficientBy(Ratio)); // We don't yet support all kinds of legalization. - auto TA = TLI->getTypeAction(AccumVectorType->getContext(), - EVT::getEVT(AccumVectorType)); - switch (TA) { + auto TC = TLI->getTypeConversion(AccumVectorType->getContext(), + EVT::getEVT(AccumVectorType)); + switch (TC.first) { default: return Invalid; case TargetLowering::TypeLegal: case TargetLowering::TypePromoteInteger: case TargetLowering::TypeSplitVector: + // The legalised type (e.g. after splitting) must be legal too. + if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) != + TargetLowering::TypeLegal) + return Invalid; break; } - // Check what kind of type-legalisation happens. std::pair<InstructionCost, MVT> AccumLT = getTypeLegalizationCost(AccumVectorType); std::pair<InstructionCost, MVT> InputLT = diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index be44b8f..33f35ad 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -58,20 +58,6 @@ def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO : let hasSideEffects = 0; } -def SPILL_PPR_TO_ZPR_SLOT_PSEUDO : - Pseudo<(outs), (ins PPRorPNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]> -{ - let mayStore = 1; - let hasSideEffects = 0; -} - -def FILL_PPR_FROM_ZPR_SLOT_PSEUDO : - Pseudo<(outs PPRorPNRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]> -{ - let mayLoad = 1; - let hasSideEffects = 0; -} - def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; // SME ZA loads and stores def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ddb2381..1a697f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1411,20 +1411,6 @@ def FeatureGloballyAddressableScratch : SubtargetFeature< "FLAT instructions can access scratch memory for any thread in any wave" >; -// FIXME: Remove after all users are migrated to attribute. -def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", - "DynamicVGPR", - "true", - "Enable dynamic VGPR mode" ->; - -// FIXME: Remove after all users are migrated to attribute. -def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32", - "DynamicVGPRBlockSize32", - "true", - "Use a block size of 32 for dynamic VGPR allocation (default is 16)" ->; - // Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and // restoring the callee-saved registers. def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr", @@ -1462,10 +1448,10 @@ def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records "The buffer resource (V#) supports 45-bit num_records" >; -def FeatureCluster : SubtargetFeature< "cluster", - "HasCluster", +def FeatureClusters : SubtargetFeature< "clusters", + "HasClusters", "true", - "Has cluster support" + "Has clusters of workgroups support" >; // Dummy feature used to disable assembler instructions. @@ -2134,7 +2120,7 @@ def FeatureISAVersion12_50 : FeatureSet< Feature45BitNumRecordsBufferResource, FeatureSupportsXNACK, FeatureXNACK, - FeatureCluster, + FeatureClusters, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 848d9a5..557d87f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5043,6 +5043,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: case Intrinsic::amdgcn_mfma_f32_16x16x32_bf16: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src @@ -5051,29 +5054,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // vdst, srcA, srcB, srcC const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); OpdsMapping[0] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned MinNumRegsRequired = DstSize / 32; + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); OpdsMapping[0] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + Info->getMinNumAGPRs() >= MinNumRegsRequired ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index a67a7be..d0c0822 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1944,6 +1944,7 @@ public: void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); + void cvtOpSelHelper(MCInst &Inst, unsigned OpSel); bool parseDimId(unsigned &Encoding); ParseStatus parseDim(OperandVector &Operands); @@ -9239,6 +9240,33 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { MCOI::OperandConstraint::TIED_TO) == -1; } +void AMDGPUAsmParser::cvtOpSelHelper(MCInst &Inst, unsigned OpSel) { + unsigned Opc = Inst.getOpcode(); + constexpr AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2}; + constexpr AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but + // no src1. So continue instead of break. + continue; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + // op_sel[3] is encoded in src0_modifiers. + if (ModOps[J] == AMDGPU::OpName::src0_modifiers && (OpSel & (1 << 3)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; @@ -9275,6 +9303,16 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + + // Some v_interp instructions use op_sel[3] for dst. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOpSel); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + cvtOpSelHelper(Inst, OpSel); + } } void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) @@ -9310,31 +9348,10 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) if (OpSelIdx == -1) return; - const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, - AMDGPU::OpName::src2}; - const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers}; - unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); - - for (int J = 0; J < 3; ++J) { - int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); - if (OpIdx == -1) - break; - - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); - - if ((OpSel & (1 << J)) != 0) - ModVal |= SISrcMods::OP_SEL_0; - if (ModOps[J] == AMDGPU::OpName::src0_modifiers && - (OpSel & (1 << 3)) != 0) - ModVal |= SISrcMods::DST_OP_SEL; - - Inst.getOperand(ModIdx).setImm(ModVal); - } + cvtOpSelHelper(Inst, OpSel); } + void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 7b94ea3..f291e37 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -541,7 +541,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { unsigned GCNSubtarget::getBaseMaxNumVGPRs( const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const { - const auto &[Min, Max] = NumVGPRBounds; + const auto [Min, Max] = NumVGPRBounds; // Check if maximum number of VGPRs was explicitly requested using // "amdgpu-num-vgpr" attribute. diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 879bf5a..c2e6078 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -288,7 +288,7 @@ protected: bool Has45BitNumRecordsBufferResource = false; - bool HasCluster = false; + bool HasClusters = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1839,7 +1839,7 @@ public: } /// \returns true if the subtarget supports clusters of workgroups. - bool hasClusters() const { return HasCluster; } + bool hasClusters() const { return HasClusters; } /// \returns true if the subtarget requires a wait for xcnt before atomic /// flat/global stores & rmw. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index d3b5718..3563caa 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1280,6 +1280,17 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Some instructions, e.g. v_interp_p2_f16 in GFX9, have src0, src2, but no + // src1. + if (NumOps == 1 && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src2) && + !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1)) { + Ops[NumOps++] = DefaultValue; // Set src1_modifiers to default. + int Mod2Idx = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); + assert(Mod2Idx != -1); + Ops[NumOps++] = MI->getOperand(Mod2Idx).getImm(); + } + const bool HasDst = (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst) != -1) || (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst) != -1); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e233457..1a686a9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17346,74 +17346,24 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); - // Prefer VGPRs over AGPRs in mAI instructions where possible. - // This saves a chain-copy of registers and better balance register - // use between vgpr and agpr as agpr tuples tend to be big. - if (!MI.getDesc().operands().empty()) { - unsigned Opc = MI.getOpcode(); - bool HasAGPRs = Info->mayNeedAGPRs(); - const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - for (auto I : - {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { - if (I == -1) - break; - if ((I == Src2Idx) && (HasAGPRs)) - break; - MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !Op.getReg().isVirtual()) - continue; - auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); - if (!TRI->hasAGPRs(RC)) - continue; - auto *Src = MRI.getUniqueVRegDef(Op.getReg()); - if (!Src || !Src->isCopy() || - !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) - continue; - auto *NewRC = TRI->getEquivalentVGPRClass(RC); - // All uses of agpr64 and agpr32 can also accept vgpr except for - // v_accvgpr_read, but we do not produce agpr reads during selection, - // so no use checks are needed. - MRI.setRegClass(Op.getReg(), NewRC); - } - - if (TII->isMAI(MI)) { - // The ordinary src0, src1, src2 were legalized above. - // - // We have to also legalize the appended v_mfma_ld_scale_b32 operands, - // as a separate instruction. - int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::scale_src0); - if (Src0Idx != -1) { - int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::scale_src1); - if (TII->usesConstantBus(MRI, MI, Src0Idx) && - TII->usesConstantBus(MRI, MI, Src1Idx)) - TII->legalizeOpWithMove(MI, Src1Idx); - } - } - - if (!HasAGPRs) - return; - - // Resolve the rest of AV operands to AGPRs. - if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { - if (Src2->isReg() && Src2->getReg().isVirtual()) { - auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); - if (TRI->isVectorSuperClass(RC)) { - auto *NewRC = TRI->getEquivalentAGPRClass(RC); - MRI.setRegClass(Src2->getReg(), NewRC); - if (Src2->isTied()) - MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); - } - } + if (TII->isMAI(MI)) { + // The ordinary src0, src1, src2 were legalized above. + // + // We have to also legalize the appended v_mfma_ld_scale_b32 operands, + // as a separate instruction. + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src0); + if (Src0Idx != -1) { + int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::scale_src1); + if (TII->usesConstantBus(MRI, MI, Src0Idx) && + TII->usesConstantBus(MRI, MI, Src1Idx)) + TII->legalizeOpWithMove(MI, Src1Idx); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 908d856..b398db4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -33,17 +33,20 @@ using namespace llvm; // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases // where it is better to produce the VGPR form (e.g. if there are VGPR users // of the MFMA result). -static cl::opt<bool> MFMAVGPRForm( - "amdgpu-mfma-vgpr-form", cl::Hidden, +static cl::opt<bool, true> MFMAVGPRFormOpt( + "amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " "unspecified, default to compiler heuristics"), - cl::init(false)); + cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false), + cl::Hidden); const GCNTargetMachine &getTM(const GCNSubtarget *STI) { const SITargetLowering *TLI = STI->getTargetLowering(); return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine()); } +bool SIMachineFunctionInfo::MFMAVGPRForm = false; + SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI) : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)), @@ -81,14 +84,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts(); if (ST.hasGFX90AInsts()) { - // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection - // should be separated from availability of AGPRs - if (MFMAVGPRForm || - (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() && - !mayUseAGPRs(F))) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. + // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the + // allocation granule and clamping. + auto [MinNumAGPRAttr, MaxNumAGPRAttr] = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u}, + /*OnlyFirstRequired=*/true); + MinNumAGPRs = MinNumAGPRAttr; } if (AMDGPU::isChainCC(CC)) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 4560615..b7dbb59 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -509,7 +509,9 @@ private: // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; - bool MayNeedAGPRs : 1; + /// Minimum number of AGPRs required to allocate in the function. Only + /// relevant for gfx90a-gfx950. For gfx908, this should be infinite. + unsigned MinNumAGPRs = ~0u; // The hard-wired high half of the address of the global information table // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since @@ -537,6 +539,8 @@ private: void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override; public: + static bool MFMAVGPRForm; + struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; @@ -1196,9 +1200,7 @@ public: unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; } - bool mayNeedAGPRs() const { - return MayNeedAGPRs; - } + unsigned getMinNumAGPRs() const { return MinNumAGPRs; } // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3c2dd42..3115579 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1118,12 +1118,7 @@ SIRegisterInfo::getPointerRegClass(unsigned Kind) const { const TargetRegisterClass * SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) - return getEquivalentVGPRClass(RC); - if (RC == &AMDGPU::SCC_CLASSRegClass) - return getWaveMaskRegClass(); - - return RC; + return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC; } static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 20fa141..f7f4d46 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1353,11 +1353,6 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, if (DynamicVGPRBlockSize != 0) return DynamicVGPRBlockSize; - // Temporarily check the subtarget feature, until we fully switch to using - // attributes. - if (STI->getFeatureBits().test(FeatureDynamicVGPR)) - return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16; - bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); @@ -1412,10 +1407,7 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, if (Features.test(FeatureGFX90AInsts)) return 512; - // Temporarily check the subtarget feature, until we fully switch to using - // attributes. - if (DynamicVGPRBlockSize != 0 || - STI->getFeatureBits().test(FeatureDynamicVGPR)) + if (DynamicVGPRBlockSize != 0) // On GFX12 we can allocate at most 8 blocks of VGPRs. return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize); return getAddressableNumArchVGPRs(STI); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 4a2b54d..42ec8ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -97,6 +97,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> : VOP3_Pseudo<OpName, P, pattern> { let AsmMatchConverter = "cvtVOP3Interp"; let mayRaiseFPException = 0; + let VOP3_OPSEL = P.HasOpSel; } def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { @@ -119,16 +120,17 @@ def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> { let HasSrc0Mods = 0; } -class getInterp16Asm <bit HasSrc2, bit HasOMod> { +class getInterp16Asm <bit HasSrc2, bit HasOMod, bit OpSel> { string src2 = !if(HasSrc2, ", $src2_modifiers", ""); string omod = !if(HasOMod, "$omod", ""); + string opsel = !if(OpSel, "$op_sel", ""); string ret = - " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod; + " $vdst, $src0_modifiers, $attr$attrchan"#src2#"$high$clamp"#omod#opsel; } class getInterp16Ins <bit HasSrc2, bit HasOMod, - Operand Src0Mod, Operand Src2Mod> { - dag ret = !if(HasSrc2, + Operand Src0Mod, Operand Src2Mod, bit OpSel> { + dag ret1 = !if(HasSrc2, !if(HasOMod, (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, InterpAttr:$attr, InterpAttrChan:$attrchan, @@ -143,19 +145,22 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, InterpAttr:$attr, InterpAttrChan:$attrchan, highmod:$high, Clamp0:$clamp, omod0:$omod) ); + dag ret2 = !if(OpSel, (ins op_sel0:$op_sel), (ins)); + dag ret = !con(ret1, ret2); } -class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { +class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> { let IsSingle = 1; let HasOMod = !ne(DstVT.Value, f16.Value); let HasHigh = 1; + let HasOpSel = OpSel; let Src0Mod = FPVRegInputMods; let Src2Mod = FPVRegInputMods; let Outs64 = (outs DstRC.RegClass:$vdst); - let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret; - let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret; + let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod, OpSel>.ret; + let Asm64 = getInterp16Asm<HasSrc2, HasOMod, OpSel>.ret; } //===----------------------------------------------------------------------===// @@ -480,7 +485,7 @@ let SubtargetPredicate = isGFX9Plus in { defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>; defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>; let OtherPredicates = [isNotGFX90APlus] in -def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; +def V_INTERP_P2_F16_opsel : VOP3Interp <"v_interp_p2_f16_opsel", VOP3_INTERP16<[f16, f32, i32, f32], /*OpSel*/ 1>>; } // End SubtargetPredicate = isGFX9Plus // This predicate should only apply to the selection pattern. The @@ -2676,6 +2681,14 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> } } +multiclass VOP3Interp_F16_OpSel_Real_gfx9<bits<10> op, string OpName, string AsmName> { + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + VOP3Interp_OpSel_gfx9 <op, !cast<VOP3_Pseudo>(OpName).Pfl> { + VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); + let AsmString = AsmName # ps.AsmOperands; + } +} + multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> { def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>, VOP3e_vi <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> { @@ -2788,7 +2801,7 @@ defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">; defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">; -defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">; +defm V_INTERP_P2_F16_opsel : VOP3Interp_F16_OpSel_Real_gfx9 <0x277, "V_INTERP_P2_F16_opsel", "v_interp_p2_f16">; defm V_ADD_I32 : VOP3_Real_vi <0x29c>; defm V_SUB_I32 : VOP3_Real_vi <0x29d>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 5daf860..3a0cc35 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -67,7 +67,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3P_Mix_Profile<P, Features, 0> { let IsTrue16 = 1; - let IsRealTrue16 = 1; + let IsRealTrue16 = 1; let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret; } @@ -950,7 +950,7 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> : } // Currently assumes scaled instructions never have abid -class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag < +class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : PatFrag < !if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp, node:$src0_modifiers, node:$scale_src0, node:$src1_modifiers, node:$scale_src1), @@ -959,37 +959,30 @@ class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = (ops node:$blgp))), !if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1), !if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), - (Op $src0, $src1, $src2, $cbsz, $blgp))), - pred ->; - -defvar MayNeedAGPRs = [{ - return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; - -defvar MayNeedAGPRs_gisel = [{ - return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; + (Op $src0, $src1, $src2, $cbsz, $blgp)))>; -defvar MayNotNeedAGPRs = [{ - return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; +class CanUseAGPR_MAI<ValueType vt> { + code PredicateCode = [{ + return !Subtarget->hasGFX90AInsts() || + (!SIMachineFunctionInfo::MFMAVGPRForm && + MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + }] # !srl(vt.Size, 5) # ");"; -defvar MayNotNeedAGPRs_gisel = [{ - return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); -}]; + code GISelPredicateCode = [{ + return !Subtarget->hasGFX90AInsts() || + (!SIMachineFunctionInfo::MFMAVGPRForm && + MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >= + }] # !srl(vt.Size, 5) # ");"; +} -class AgprMAIFrag<SDPatternOperator Op, bit HasAbid = true, +class AgprMAIFrag<SDPatternOperator Op, ValueType vt, bit HasAbid = true, bit Scaled = false> : - MAIFrag<Op, MayNeedAGPRs, HasAbid, Scaled> { - let GISelPredicateCode = MayNeedAGPRs_gisel; -} + MAIFrag<Op, HasAbid, Scaled>, + CanUseAGPR_MAI<vt>; class VgprMAIFrag<SDPatternOperator Op, bit HasAbid = true, - bit Scaled = false> : - MAIFrag<Op, MayNotNeedAGPRs, HasAbid, Scaled> { - let GISelPredicateCode = MayNotNeedAGPRs_gisel; -} + bit Scaled = false> : + MAIFrag<Op, HasAbid, Scaled>; let isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; @@ -1037,16 +1030,19 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, bit HasAbid = true, bit Scaled = false> { defvar NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap; + defvar ProfileAGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P); + defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"); + let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { - def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>, + def _e64 : MAIInst<OpName, ProfileAGPR, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>, MFMATable<0, "AGPR", NAME # "_e64">; let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in - def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", ProfileVGPR, !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>, MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">; } @@ -1055,12 +1051,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { - def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, HasAbid, Scaled>), Scaled>, + def "_mac_e64" : MAIInst<OpName # "_mac", ProfileAGPR, + !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node, ProfileAGPR.DstVT, HasAbid, Scaled>), Scaled>, MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">; let OtherPredicates = [isGFX90APlus] in - def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", ProfileVGPR, !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node, HasAbid, Scaled>), Scaled>, MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; } @@ -1074,11 +1070,11 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; defvar HasAbid = false; - - defvar NoDstOverlap = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl).NoDstOverlap; + defvar Profile = !cast<VOPProfileMAI>(!cast<MAIInst>(UnscaledOpName#"_e64").Pfl); + defvar NoDstOverlap = Profile.NoDstOverlap; def _e64 : ScaledMAIInst<OpName, - !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, HasAbid, true>)>, + !cast<MAIInst>(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag<node, Profile.DstVT, HasAbid, true>)>, MFMATable<0, "AGPR", NAME # "_e64">; def _vgprcd_e64 : ScaledMAIInst<OpName # "_vgprcd", @@ -1090,7 +1086,7 @@ multiclass ScaledMAIInst_mc<string OpName, string UnscaledOpName_, SDPatternOper isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = UnscaledOpName_ in { def _mac_e64 : ScaledMAIInst<OpName # "_mac", - !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, HasAbid, true>>, + !cast<MAIInst>(UnscaledOpName # "_mac_e64"), AgprMAIFrag<node, Profile.DstVT, HasAbid, true>>, MFMATable<1, "AGPR", NAME # "_e64">; def _mac_vgprcd_e64 : ScaledMAIInst<OpName # " _mac_vgprcd", diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 631f0f3..8325c62 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -419,6 +419,13 @@ class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, let Inst{14-11} = scale_sel; } +class VOP3Interp_OpSel_gfx9<bits<10> op, VOPProfile p> : VOP3Interp_vi<op, p> { + let Inst{11} = src0_modifiers{2}; + // There's no src1 + let Inst{13} = src2_modifiers{2}; + let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 28d4bb9..a8b854f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -4528,6 +4528,10 @@ class WMMA_REGINFO<WMMA_REGS r, string op, string metadata = "", string kind = " !eq(ptx_elt_type, "e2m1"), !ne(kind, "")) : [hasSM120a, hasPTX<87>], + !and(!or(!eq(ptx_elt_type,"e4m3"), + !eq(ptx_elt_type,"e5m2")), + !eq(geom, "m16n8k16")) : [hasSM<89>, hasPTX<87>], + !or(!eq(ptx_elt_type, "e4m3"), !eq(ptx_elt_type, "e5m2")) : [hasSM<89>, hasPTX<84>], @@ -4543,6 +4547,11 @@ class WMMA_REGINFO<WMMA_REGS r, string op, string metadata = "", string kind = " !and(!eq(geom, "m8n8k4"), !eq(ptx_elt_type, "f64")) : [hasSM<80>, hasPTX<70>], + !and(!or(!eq(geom, "m16n8k4"), + !eq(geom, "m16n8k8"), + !eq(geom, "m16n8k16")), + !eq(ptx_elt_type, "f64")) : [hasSM<90>, hasPTX<78>], + // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16 !and(!or(!eq(geom, "m8n32k16"), !eq(geom, "m32n8k16")), @@ -4827,8 +4836,8 @@ defset list<WMMA_INSTR> WMMAs = { // MMA class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, WMMA_REGINFO FragC, WMMA_REGINFO FragD, - string ALayout, string BLayout, int Satfinite, string b1op> - : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, FragA, FragB, FragC, FragD>.record, + string ALayout, string BLayout, int Satfinite, string b1op, string Kind> + : WMMA_INSTR<MMA_NAME<ALayout, BLayout, Satfinite, b1op, Kind, FragA, FragB, FragC, FragD>.record, [FragA.Ins, FragB.Ins, FragC.Ins]>, // Requires does not seem to have effect on Instruction w/o Patterns. // We set it here anyways and propagate to the Pat<> we construct below. @@ -4843,6 +4852,7 @@ class MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB, # FragA.geom # "." # ALayout # "." # BLayout + # !if(!ne(Kind, ""), "." # Kind, "") # !if(Satfinite, ".satfinite", "") # TypeList # b1op # "\n\t\t" @@ -4859,13 +4869,15 @@ defset list<WMMA_INSTR> MMAs = { foreach satf = [0, 1] in { foreach op = NVVM_MMA_OPS.all_mma_ops in { foreach b1op = NVVM_MMA_B1OPS<op>.ret in { - if NVVM_MMA_SUPPORTED<op, layout_a, layout_b, satf>.ret then { - def : MMA<WMMA_REGINFO<op[0], "mma">, - WMMA_REGINFO<op[1], "mma">, - WMMA_REGINFO<op[2], "mma">, - WMMA_REGINFO<op[3], "mma">, - layout_a, layout_b, satf, b1op>; - } + foreach kind = ["", "kind::f8f6f4"] in { + if NVVM_MMA_SUPPORTED<op, layout_a, layout_b, kind, satf>.ret then { + def : MMA<WMMA_REGINFO<op[0], "mma", "", kind>, + WMMA_REGINFO<op[1], "mma", "", kind>, + WMMA_REGINFO<op[2], "mma", "", kind>, + WMMA_REGINFO<op[3], "mma", "", kind>, + layout_a, layout_b, satf, b1op, kind>; + } + } // kind } // b1op } // op } // satf diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 1fc475d..561a9c5 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -349,32 +349,30 @@ public: bool isImm() const override { return Kind == Immediate || Kind == Expression; } - bool isU1Imm() const { return Kind == Immediate && isUInt<1>(getImm()); } - bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); } - bool isU3Imm() const { return Kind == Immediate && isUInt<3>(getImm()); } - bool isU4Imm() const { return Kind == Immediate && isUInt<4>(getImm()); } - bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); } - bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); } - bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); } - bool isU6ImmX2() const { return Kind == Immediate && - isUInt<6>(getImm()) && - (getImm() & 1) == 0; } - bool isU7Imm() const { return Kind == Immediate && isUInt<7>(getImm()); } - bool isU7ImmX4() const { return Kind == Immediate && - isUInt<7>(getImm()) && - (getImm() & 3) == 0; } - bool isU8Imm() const { return Kind == Immediate && isUInt<8>(getImm()); } - bool isU8ImmX8() const { return Kind == Immediate && - isUInt<8>(getImm()) && - (getImm() & 7) == 0; } - - bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); } - bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); } + + template <uint64_t N> bool isUImm() const { + return Kind == Immediate && isUInt<N>(getImm()); + } + template <uint64_t N> bool isSImm() const { + return Kind == Immediate && isInt<N>(getImm()); + } + bool isU6ImmX2() const { return isUImm<6>() && (getImm() & 1) == 0; } + bool isU7ImmX4() const { return isUImm<7>() && (getImm() & 3) == 0; } + bool isU8ImmX8() const { return isUImm<8>() && (getImm() & 7) == 0; } + bool isU16Imm() const { return isExtImm<16>(/*Signed*/ false, 1); } bool isS16Imm() const { return isExtImm<16>(/*Signed*/ true, 1); } bool isS16ImmX4() const { return isExtImm<16>(/*Signed*/ true, 4); } bool isS16ImmX16() const { return isExtImm<16>(/*Signed*/ true, 16); } bool isS17Imm() const { return isExtImm<17>(/*Signed*/ true, 1); } + bool isS34Imm() const { + // Once the PC-Rel ABI is finalized, evaluate whether a 34-bit + // ContextImmediate is needed. + return Kind == Expression || isSImm<34>(); + } + bool isS34ImmX16() const { + return Kind == Expression || (isSImm<34>() && (getImm() & 15) == 0); + } bool isHashImmX8() const { // The Hash Imm form is used for instructions that check or store a hash. @@ -384,16 +382,6 @@ public: (getImm() & 7) == 0); } - bool isS34ImmX16() const { - return Kind == Expression || - (Kind == Immediate && isInt<34>(getImm()) && (getImm() & 15) == 0); - } - bool isS34Imm() const { - // Once the PC-Rel ABI is finalized, evaluate whether a 34-bit - // ContextImmediate is needed. - return Kind == Expression || (Kind == Immediate && isInt<34>(getImm())); - } - bool isTLSReg() const { return Kind == TLSRegister; } bool isDirectBr() const { if (Kind == Expression) @@ -1637,7 +1625,7 @@ bool PPCAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, if (Operands.size() != 5) return false; PPCOperand &EHOp = (PPCOperand &)*Operands[4]; - if (EHOp.isU1Imm() && EHOp.getImm() == 0) + if (EHOp.isUImm<1>() && EHOp.getImm() == 0) Operands.pop_back(); } @@ -1817,7 +1805,7 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } PPCOperand &Op = static_cast<PPCOperand &>(AsmOp); - if (Op.isU3Imm() && Op.getImm() == ImmVal) + if (Op.isUImm<3>() && Op.getImm() == ImmVal) return Match_Success; return Match_InvalidOperand; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 48c31c9..81d8e94 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -206,45 +206,24 @@ PPCMCCodeEmitter::getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo, return RegBits; } -unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpNo); - if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); - - // Add a fixup for the immediate field. - addFixup(Fixups, IsLittleEndian ? 0 : 2, MO.getExpr(), PPC::fixup_ppc_half16); - return 0; -} - -uint64_t PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI, - MCFixupKind Fixup) const { +template <MCFixupKind Fixup> +uint64_t PPCMCCodeEmitter::getImmEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); assert(!MO.isReg() && "Not expecting a register for this operand."); if (MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + uint32_t Offset = 0; + if (Fixup == PPC::fixup_ppc_half16) + Offset = IsLittleEndian ? 0 : 2; + // Add a fixup for the immediate field. - addFixup(Fixups, 0, MO.getExpr(), Fixup); + addFixup(Fixups, Offset, MO.getExpr(), Fixup); return 0; } -uint64_t -PPCMCCodeEmitter::getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return getImm34Encoding(MI, OpNo, Fixups, STI, PPC::fixup_ppc_imm34); -} - -uint64_t -PPCMCCodeEmitter::getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return getImm34Encoding(MI, OpNo, Fixups, STI, PPC::fixup_ppc_pcrel34); -} - unsigned PPCMCCodeEmitter::getDispRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index b574557..3356513 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -47,19 +47,10 @@ public: unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - uint64_t getImm34Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI, - MCFixupKind Fixup) const; - uint64_t getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - uint64_t getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + template <MCFixupKind Fixup> + uint64_t getImmEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getDispRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 60efa4c..fdca5ebc 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -14,30 +14,6 @@ //===----------------------------------------------------------------------===// // 64-bit operands. // -def s16imm64 : Operand<i64> { - let PrintMethod = "printS16ImmOperand"; - let EncoderMethod = "getImm16Encoding"; - let ParserMatchClass = PPCS16ImmAsmOperand; - let DecoderMethod = "decodeSImmOperand<16>"; - let OperandType = "OPERAND_IMMEDIATE"; -} -def u16imm64 : Operand<i64> { - let PrintMethod = "printU16ImmOperand"; - let EncoderMethod = "getImm16Encoding"; - let ParserMatchClass = PPCU16ImmAsmOperand; - let DecoderMethod = "decodeUImmOperand<16>"; - let OperandType = "OPERAND_IMMEDIATE"; -} -def s17imm64 : Operand<i64> { - // This operand type is used for addis/lis to allow the assembler parser - // to accept immediates in the range -65536..65535 for compatibility with - // the GNU assembler. The operand is treated as 16-bit otherwise. - let PrintMethod = "printS16ImmOperand"; - let EncoderMethod = "getImm16Encoding"; - let ParserMatchClass = PPCS17ImmAsmOperand; - let DecoderMethod = "decodeSImmOperand<16>"; - let OperandType = "OPERAND_IMMEDIATE"; -} def tocentry : Operand<iPTR> { let MIOperandInfo = (ops i64imm:$imm); } diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index c616db4..23d6d88 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -30,6 +30,11 @@ // Altivec transformation functions and pattern fragments. // +// fneg is not legal, and desugared as an xor. +def desugared_fneg : PatFrag<(ops node:$x), (v4f32 (bitconvert (xor (bitconvert $x), + (int_ppc_altivec_vslw (bitconvert (v16i8 immAllOnesV)), + (bitconvert (v16i8 immAllOnesV))))))>; + def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG); @@ -467,11 +472,12 @@ def VMADDFP : VAForm_1<46, (outs vrrc:$RT), (ins vrrc:$RA, vrrc:$RC, vrrc:$RB), [(set v4f32:$RT, (fma v4f32:$RA, v4f32:$RC, v4f32:$RB))]>; -// FIXME: The fma+fneg pattern won't match because fneg is not legal. +// fneg is not legal, hence we have to match on the desugared version. def VNMSUBFP: VAForm_1<47, (outs vrrc:$RT), (ins vrrc:$RA, vrrc:$RC, vrrc:$RB), "vnmsubfp $RT, $RA, $RC, $RB", IIC_VecFP, - [(set v4f32:$RT, (fneg (fma v4f32:$RA, v4f32:$RC, - (fneg v4f32:$RB))))]>; + [(set v4f32:$RT, (desugared_fneg (fma v4f32:$RA, v4f32:$RC, + (desugared_fneg v4f32:$RB))))]>; + let hasSideEffects = 1 in { def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>; def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs, @@ -892,6 +898,13 @@ def : Pat<(mul v8i16:$vA, v8i16:$vB), (VMLADDUHM $vA, $vB, (v8i16(V_SET0H)))>; // Add def : Pat<(add (mul v8i16:$vA, v8i16:$vB), v8i16:$vC), (VMLADDUHM $vA, $vB, $vC)>; + +// Fused negated multiply-subtract +def : Pat<(v4f32 (desugared_fneg + (int_ppc_altivec_vmaddfp v4f32:$RA, v4f32:$RC, + (desugared_fneg v4f32:$RB)))), + (VNMSUBFP $RA, $RC, $RB)>; + // Saturating adds/subtracts. def : Pat<(v16i8 (saddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDSBS $vA, $vB))>; def : Pat<(v16i8 (uaddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDUBS $vA, $vB))>; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 6d8c122..65d0484 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -615,7 +615,8 @@ def spe4rc : RegisterOperand<GPRC> { } def PPCU1ImmAsmOperand : AsmOperandClass { - let Name = "U1Imm"; let PredicateMethod = "isU1Imm"; + let Name = "U1Imm"; + let PredicateMethod = "isUImm<1>"; let RenderMethod = "addImmOperands"; } def u1imm : Operand<i32> { @@ -626,7 +627,8 @@ def u1imm : Operand<i32> { } def PPCU2ImmAsmOperand : AsmOperandClass { - let Name = "U2Imm"; let PredicateMethod = "isU2Imm"; + let Name = "U2Imm"; + let PredicateMethod = "isUImm<2>"; let RenderMethod = "addImmOperands"; } def u2imm : Operand<i32> { @@ -647,7 +649,8 @@ def atimm : Operand<i32> { } def PPCU3ImmAsmOperand : AsmOperandClass { - let Name = "U3Imm"; let PredicateMethod = "isU3Imm"; + let Name = "U3Imm"; + let PredicateMethod = "isUImm<3>"; let RenderMethod = "addImmOperands"; } def u3imm : Operand<i32> { @@ -658,7 +661,8 @@ def u3imm : Operand<i32> { } def PPCU4ImmAsmOperand : AsmOperandClass { - let Name = "U4Imm"; let PredicateMethod = "isU4Imm"; + let Name = "U4Imm"; + let PredicateMethod = "isUImm<4>"; let RenderMethod = "addImmOperands"; } def u4imm : Operand<i32> { @@ -668,7 +672,8 @@ def u4imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCS5ImmAsmOperand : AsmOperandClass { - let Name = "S5Imm"; let PredicateMethod = "isS5Imm"; + let Name = "S5Imm"; + let PredicateMethod = "isSImm<5>"; let RenderMethod = "addImmOperands"; } def s5imm : Operand<i32> { @@ -678,7 +683,8 @@ def s5imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCU5ImmAsmOperand : AsmOperandClass { - let Name = "U5Imm"; let PredicateMethod = "isU5Imm"; + let Name = "U5Imm"; + let PredicateMethod = "isUImm<5>"; let RenderMethod = "addImmOperands"; } def u5imm : Operand<i32> { @@ -688,7 +694,8 @@ def u5imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCU6ImmAsmOperand : AsmOperandClass { - let Name = "U6Imm"; let PredicateMethod = "isU6Imm"; + let Name = "U6Imm"; + let PredicateMethod = "isUImm<6>"; let RenderMethod = "addImmOperands"; } def u6imm : Operand<i32> { @@ -698,7 +705,8 @@ def u6imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCU7ImmAsmOperand : AsmOperandClass { - let Name = "U7Imm"; let PredicateMethod = "isU7Imm"; + let Name = "U7Imm"; + let PredicateMethod = "isUImm<7>"; let RenderMethod = "addImmOperands"; } def u7imm : Operand<i32> { @@ -708,7 +716,8 @@ def u7imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCU8ImmAsmOperand : AsmOperandClass { - let Name = "U8Imm"; let PredicateMethod = "isU8Imm"; + let Name = "U8Imm"; + let PredicateMethod = "isUImm<8>"; let RenderMethod = "addImmOperands"; } def u8imm : Operand<i32> { @@ -718,7 +727,8 @@ def u8imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCU10ImmAsmOperand : AsmOperandClass { - let Name = "U10Imm"; let PredicateMethod = "isU10Imm"; + let Name = "U10Imm"; + let PredicateMethod = "isUImm<10>"; let RenderMethod = "addImmOperands"; } def u10imm : Operand<i32> { @@ -728,7 +738,8 @@ def u10imm : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } def PPCU12ImmAsmOperand : AsmOperandClass { - let Name = "U12Imm"; let PredicateMethod = "isU12Imm"; + let Name = "U12Imm"; + let PredicateMethod = "isUImm<12>"; let RenderMethod = "addImmOperands"; } def u12imm : Operand<i32> { @@ -743,7 +754,14 @@ def PPCS16ImmAsmOperand : AsmOperandClass { } def s16imm : Operand<i32> { let PrintMethod = "printS16ImmOperand"; - let EncoderMethod = "getImm16Encoding"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_half16>"; + let ParserMatchClass = PPCS16ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; +} +def s16imm64 : Operand<i64> { + let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_half16>"; let ParserMatchClass = PPCS16ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<16>"; let OperandType = "OPERAND_IMMEDIATE"; @@ -754,7 +772,14 @@ def PPCU16ImmAsmOperand : AsmOperandClass { } def u16imm : Operand<i32> { let PrintMethod = "printU16ImmOperand"; - let EncoderMethod = "getImm16Encoding"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_half16>"; + let ParserMatchClass = PPCU16ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; +} +def u16imm64 : Operand<i64> { + let PrintMethod = "printU16ImmOperand"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_half16>"; let ParserMatchClass = PPCU16ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<16>"; let OperandType = "OPERAND_IMMEDIATE"; @@ -768,7 +793,17 @@ def s17imm : Operand<i32> { // to accept immediates in the range -65536..65535 for compatibility with // the GNU assembler. The operand is treated as 16-bit otherwise. let PrintMethod = "printS16ImmOperand"; - let EncoderMethod = "getImm16Encoding"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_half16>"; + let ParserMatchClass = PPCS17ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; +} +def s17imm64 : Operand<i64> { + // This operand type is used for addis/lis to allow the assembler parser + // to accept immediates in the range -65536..65535 for compatibility with + // the GNU assembler. The operand is treated as 16-bit otherwise. + let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_half16>"; let ParserMatchClass = PPCS17ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<16>"; let OperandType = "OPERAND_IMMEDIATE"; @@ -780,14 +815,14 @@ def PPCS34ImmAsmOperand : AsmOperandClass { } def s34imm : Operand<i64> { let PrintMethod = "printS34ImmOperand"; - let EncoderMethod = "getImm34EncodingNoPCRel"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_imm34>"; let ParserMatchClass = PPCS34ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<34>"; let OperandType = "OPERAND_IMMEDIATE"; } def s34imm_pcrel : Operand<i64> { let PrintMethod = "printS34ImmOperand"; - let EncoderMethod = "getImm34EncodingPCRel"; + let EncoderMethod = "getImmEncoding<PPC::fixup_ppc_pcrel34>"; let ParserMatchClass = PPCS34ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<34>"; let OperandType = "OPERAND_IMMEDIATE"; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 34026ed..ecfb5fe 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -439,18 +439,6 @@ bool RISCVCallLowering::canLowerReturn(MachineFunction &MF, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, MF.getFunction().getContext()); - const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>(); - - std::optional<unsigned> FirstMaskArgument = std::nullopt; - // Preassign the first mask argument. - if (Subtarget.hasVInstructions()) { - for (const auto &ArgIdx : enumerate(Outs)) { - MVT ArgVT = MVT::getVT(ArgIdx.value().Ty); - if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1) - FirstMaskArgument = ArgIdx.index(); - } - } - for (unsigned I = 0, E = Outs.size(); I < E; ++I) { MVT VT = MVT::getVT(Outs[I].Ty); if (CC_RISCV(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp index 597dd12..9f9ae2f 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp @@ -324,6 +324,10 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = GPRValueMapping; + // Atomics always use GPR destinations. Don't refine any further. + if (cast<GLoad>(MI).isAtomic()) + break; + // Use FPR64 for s64 loads on rv32. if (GPRSize == 32 && Size.getFixedValue() == 64) { assert(MF.getSubtarget<RISCVSubtarget>().hasStdExtD()); @@ -358,6 +362,10 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = GPRValueMapping; + // Atomics always use GPR sources. Don't refine any further. + if (cast<GStore>(MI).isAtomic()) + break; + // Use FPR64 for s64 stores on rv32. if (GPRSize == 32 && Size.getFixedValue() == 64) { assert(MF.getSubtarget<RISCVSubtarget>().hasStdExtD()); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index a02de31..27cf057 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1421,7 +1421,7 @@ def HasVendorXMIPSCMov : Predicate<"Subtarget->hasVendorXMIPSCMov()">, AssemblerPredicate<(all_of FeatureVendorXMIPSCMov), "'Xmipscmov' ('mips.ccmov' instruction)">; -def UseCCMovInsn : Predicate<"Subtarget->useCCMovInsn()">; +def UseMIPSCCMovInsn : Predicate<"Subtarget->useMIPSCCMovInsn()">; def FeatureVendorXMIPSLSP : RISCVExtension<1, 0, "MIPS optimization for hardware load-store bonding">; diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 7f5d0af..6d01250 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -190,3 +190,29 @@ let Predicates = [HasStdExtZbkb, NoStdExtZbb, IsRV64] in { def : Pat<(i64 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>; def : Pat<(i32 (zext (i16 GPR:$rs))), (PACKW GPR:$rs, (XLenVT X0))>; } + +//===----------------------------------------------------------------------===// +// Zalasr patterns not used by SelectionDAG +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtZalasr] in { + // the sequentially consistent loads use + // .aq instead of .aqrl to match the psABI/A.7 + def : PatLAQ<acquiring_load<atomic_load_aext_8>, LB_AQ, i16>; + def : PatLAQ<seq_cst_load<atomic_load_aext_8>, LB_AQ, i16>; + + def : PatLAQ<acquiring_load<atomic_load_nonext_16>, LH_AQ, i16>; + def : PatLAQ<seq_cst_load<atomic_load_nonext_16>, LH_AQ, i16>; + + def : PatSRL<releasing_store<atomic_store_8>, SB_RL, i16>; + def : PatSRL<seq_cst_store<atomic_store_8>, SB_RL, i16>; + + def : PatSRL<releasing_store<atomic_store_16>, SH_RL, i16>; + def : PatSRL<seq_cst_store<atomic_store_16>, SH_RL, i16>; +} + +let Predicates = [HasStdExtZalasr, IsRV64] in { + // Load pattern is in RISCVInstrInfoZalasr.td and shared with RV32. + def : PatSRL<releasing_store<atomic_store_32>, SW_RL, i32>; + def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL, i32>; +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dcce2d2..a3a4cf2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -434,7 +434,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ABS, MVT::i32, Custom); } - if (!Subtarget.useCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov()) + if (!Subtarget.useMIPSCCMovInsn() && !Subtarget.hasVendorXTHeadCondMov()) setOperationAction(ISD::SELECT, XLenVT, Custom); if (Subtarget.hasVendorXqcia() && !Subtarget.is64Bit()) { @@ -16498,43 +16498,60 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue X = N->getOperand(0); if (Subtarget.hasShlAdd(3)) { - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 2^N -> shl (shXadd X, X), N - if (isPowerOf2_64(MulAmt2)) { - SDLoc DL(N); - SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the - // shift forming a slli.uw. - if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && - X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, - DAG.getConstant(Log2_64(MulAmt2), DL, VT)); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), - Shl); - } - // Otherwise, put rhe shl second so that it can fold with following - // instructions (e.g. sext or add). - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(ISD::SHL, DL, VT, Mul359, - DAG.getConstant(Log2_64(MulAmt2), DL, VT)); - } - - // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) { - SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT), - Mul359); + int Shift; + if (int ShXAmount = isShifted359(MulAmt, Shift)) { + // 3/5/9 * 2^N -> shl (shXadd X, X), N + SDLoc DL(N); + SDValue X = N->getOperand(0); + // Put the shift first if we can fold a zext into the shift forming + // a slli.uw. + if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && + X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) { + SDValue Shl = + DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, + DAG.getConstant(ShXAmount, DL, VT), Shl); } + // Otherwise, put the shl second so that it can fold with following + // instructions (e.g. sext or add). + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShXAmount, DL, VT), X); + return DAG.getNode(ISD::SHL, DL, VT, Mul359, + DAG.getConstant(Shift, DL, VT)); + } + + // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) + int ShX; + int ShY; + switch (MulAmt) { + case 3 * 5: + ShY = 1; + ShX = 2; + break; + case 3 * 9: + ShY = 1; + ShX = 3; + break; + case 5 * 5: + ShX = ShY = 2; + break; + case 5 * 9: + ShY = 2; + ShX = 3; + break; + case 9 * 9: + ShX = ShY = 3; + break; + default: + ShX = ShY = 0; + break; + } + if (ShX) { + SDLoc DL(N); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(ShX, DL, VT), Mul359); } // If this is a power 2 + 2/4/8, we can use a shift followed by a single @@ -16557,18 +16574,14 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // variants we could implement. e.g. // (2^(1,2,3) * 3,5,9 + 1) << C2 // 2^(C1>3) * 3,5,9 +/- 1 - for (uint64_t Divisor : {3, 5, 9}) { - uint64_t C = MulAmt - 1; - if (C <= Divisor) - continue; - unsigned TZ = llvm::countr_zero(C); - if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) { + if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) { + assert(Shift != 0 && "MulAmt=4,6,10 handled before"); + if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(TZ, DL, VT), X); + DAG.getConstant(Shift, DL, VT), X); } } @@ -16576,7 +16589,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { unsigned ScaleShift = llvm::countr_zero(MulAmt - 1); if (ScaleShift >= 1 && ScaleShift < 4) { - unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2))); + unsigned ShiftAmt = llvm::countr_zero((MulAmt - 1) & (MulAmt - 2)); SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); @@ -16589,7 +16602,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x)) for (uint64_t Offset : {3, 5, 9}) { if (isPowerOf2_64(MulAmt + Offset)) { - unsigned ShAmt = Log2_64(MulAmt + Offset); + unsigned ShAmt = llvm::countr_zero(MulAmt + Offset); if (ShAmt >= VT.getSizeInBits()) continue; SDLoc DL(N); @@ -16608,21 +16621,16 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt2 = MulAmt / Divisor; // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples // of 25 which happen to be quite common. - for (uint64_t Divisor2 : {3, 5, 9}) { - if (MulAmt2 % Divisor2 != 0) - continue; - uint64_t MulAmt3 = MulAmt2 / Divisor2; - if (isPowerOf2_64(MulAmt3)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = DAG.getNode( - RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(Log2_64(Divisor2 - 1), DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Log2_64(MulAmt3), DL, VT)); - } + if (int ShBAmount = isShifted359(MulAmt2, Shift)) { + SDLoc DL(N); + SDValue Mul359A = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); + SDValue Mul359B = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, + DAG.getConstant(ShBAmount, DL, VT), Mul359A); + return DAG.getNode(ISD::SHL, DL, VT, Mul359B, + DAG.getConstant(Shift, DL, VT)); } } } @@ -25031,8 +25039,17 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) { // Mark RVV intrinsic as supported. - if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) + if (RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(II->getIntrinsicID())) { + // GISel doesn't support tuple types yet. + if (Inst.getType()->isRISCVVectorTupleTy()) + return true; + + for (unsigned i = 0; i < II->arg_size(); ++i) + if (II->getArgOperand(i)->getType()->isRISCVVectorTupleTy()) + return true; + return false; + } } if (Inst.getType()->isScalableTy()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7db4832..96e1078 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4586,24 +4586,23 @@ void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(DestReg, RegState::Kill) .addImm(ShiftAmount) .setMIFlag(Flag); - } else if (STI.hasShlAdd(3) && - ((Amount % 3 == 0 && isPowerOf2_64(Amount / 3)) || - (Amount % 5 == 0 && isPowerOf2_64(Amount / 5)) || - (Amount % 9 == 0 && isPowerOf2_64(Amount / 9)))) { + } else if (int ShXAmount, ShiftAmount; + STI.hasShlAdd(3) && + (ShXAmount = isShifted359(Amount, ShiftAmount)) != 0) { // We can use Zba SHXADD+SLLI instructions for multiply in some cases. unsigned Opc; - uint32_t ShiftAmount; - if (Amount % 9 == 0) { - Opc = RISCV::SH3ADD; - ShiftAmount = Log2_64(Amount / 9); - } else if (Amount % 5 == 0) { - Opc = RISCV::SH2ADD; - ShiftAmount = Log2_64(Amount / 5); - } else if (Amount % 3 == 0) { + switch (ShXAmount) { + case 1: Opc = RISCV::SH1ADD; - ShiftAmount = Log2_64(Amount / 3); - } else { - llvm_unreachable("implied by if-clause"); + break; + case 2: + Opc = RISCV::SH2ADD; + break; + case 3: + Opc = RISCV::SH3ADD; + break; + default: + llvm_unreachable("unexpected result of isShifted359"); } if (ShiftAmount) BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 42a0c4c..c5eddb9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -25,6 +25,25 @@ namespace llvm { +// If Value is of the form C1<<C2, where C1 = 3, 5 or 9, +// returns log2(C1 - 1) and assigns Shift = C2. +// Otherwise, returns 0. +template <typename T> int isShifted359(T Value, int &Shift) { + if (Value == 0) + return 0; + Shift = llvm::countr_zero(Value); + switch (Value >> Shift) { + case 3: + return 1; + case 5: + return 2; + case 9: + return 3; + default: + return 0; + } +} + class RISCVSubtarget; static const MachineMemOperand::Flags MONontemporalBit0 = diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 115ab38e..0b5bee1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -175,7 +175,7 @@ def MIPS_CCMOV : RVInstR4<0b11, 0b011, OPC_CUSTOM_0, (outs GPR:$rd), Sched<[]>; } -let Predicates = [UseCCMovInsn] in { +let Predicates = [UseMIPSCCMovInsn] in { def : Pat<(select (riscv_setne (XLenVT GPR:$rs2)), (XLenVT GPR:$rs1), (XLenVT GPR:$rs3)), (MIPS_CCMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td index 1dd7332..1deecd2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZalasr.td @@ -93,12 +93,11 @@ let Predicates = [HasStdExtZalasr] in { def : PatSRL<releasing_store<atomic_store_32>, SW_RL>; def : PatSRL<seq_cst_store<atomic_store_32>, SW_RL>; -} // Predicates = [HasStdExtZalasr] -let Predicates = [HasStdExtZalasr, IsRV32] in { - def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ>; - def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ>; -} // Predicates = [HasStdExtZalasr, IsRV32] + // Used by GISel for RV32 and RV64. + def : PatLAQ<acquiring_load<atomic_load_nonext_32>, LW_AQ, i32>; + def : PatLAQ<seq_cst_load<atomic_load_nonext_32>, LW_AQ, i32>; +} // Predicates = [HasStdExtZalasr] let Predicates = [HasStdExtZalasr, IsRV64] in { def : PatLAQ<acquiring_load<atomic_load_asext_32>, LW_AQ, i64>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index ce21d83..8d9b777 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -808,9 +808,9 @@ multiclass Sh2Add_UWPat<Instruction sh2add_uw> { } multiclass Sh3Add_UWPat<Instruction sh3add_uw> { - def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0xFFFFFFF8), + def : Pat<(i64 (add_like_non_imm12 (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), (XLenVT GPR:$rs2))), - (sh3add_uw (XLenVT (SRLIW GPR:$rs1, 3)), GPR:$rs2)>; + (sh3add_uw GPR:$rs1, GPR:$rs2)>; // Use SRLI to clear the LSBs and SHXADD_UW to mask and shift. def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8), (XLenVT GPR:$rs2))), diff --git a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp index c81a20b..115a96e 100644 --- a/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp @@ -92,7 +92,7 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; const RISCVSubtarget &Subtarget = Fn.getSubtarget<RISCVSubtarget>(); - if (!Subtarget.useLoadStorePairs()) + if (!Subtarget.useMIPSLoadStorePairs()) return false; bool MadeChange = false; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index e35ffaf..715ac4c 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -65,9 +65,9 @@ static cl::opt<bool> UseMIPSLoadStorePairsOpt( cl::desc("Enable the load/store pair optimization pass"), cl::init(false), cl::Hidden); -static cl::opt<bool> UseCCMovInsn("use-riscv-ccmov", - cl::desc("Use 'mips.ccmov' instruction"), - cl::init(true), cl::Hidden); +static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov", + cl::desc("Use 'mips.ccmov' instruction"), + cl::init(true), cl::Hidden); void RISCVSubtarget::anchor() {} @@ -246,10 +246,10 @@ void RISCVSubtarget::overridePostRASchedPolicy( } } -bool RISCVSubtarget::useLoadStorePairs() const { +bool RISCVSubtarget::useMIPSLoadStorePairs() const { return UseMIPSLoadStorePairsOpt && HasVendorXMIPSLSP; } -bool RISCVSubtarget::useCCMovInsn() const { - return UseCCMovInsn && HasVendorXMIPSCMov; +bool RISCVSubtarget::useMIPSCCMovInsn() const { + return UseMIPSCCMovInsn && HasVendorXMIPSCMov; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 7dffa63..6acf799 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -227,8 +227,8 @@ public: unsigned getXLen() const { return is64Bit() ? 64 : 32; } - bool useLoadStorePairs() const; - bool useCCMovInsn() const; + bool useMIPSLoadStorePairs() const; + bool useMIPSCCMovInsn() const; unsigned getFLen() const { if (HasStdExtD) return 64; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index ee25f69..7bc0b5b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2747,20 +2747,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, Intrinsic::ID IID = Inst->getIntrinsicID(); LLVMContext &C = Inst->getContext(); bool HasMask = false; + + auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo, + bool IsWrite) -> int64_t { + if (auto *TarExtTy = + dyn_cast<TargetExtType>(II->getArgOperand(0)->getType())) + return TarExtTy->getIntParameter(0); + + return 1; + }; + switch (IID) { case Intrinsic::riscv_vle_mask: case Intrinsic::riscv_vse_mask: + case Intrinsic::riscv_vlseg2_mask: + case Intrinsic::riscv_vlseg3_mask: + case Intrinsic::riscv_vlseg4_mask: + case Intrinsic::riscv_vlseg5_mask: + case Intrinsic::riscv_vlseg6_mask: + case Intrinsic::riscv_vlseg7_mask: + case Intrinsic::riscv_vlseg8_mask: + case Intrinsic::riscv_vsseg2_mask: + case Intrinsic::riscv_vsseg3_mask: + case Intrinsic::riscv_vsseg4_mask: + case Intrinsic::riscv_vsseg5_mask: + case Intrinsic::riscv_vsseg6_mask: + case Intrinsic::riscv_vsseg7_mask: + case Intrinsic::riscv_vsseg8_mask: HasMask = true; [[fallthrough]]; case Intrinsic::riscv_vle: - case Intrinsic::riscv_vse: { + case Intrinsic::riscv_vse: + case Intrinsic::riscv_vlseg2: + case Intrinsic::riscv_vlseg3: + case Intrinsic::riscv_vlseg4: + case Intrinsic::riscv_vlseg5: + case Intrinsic::riscv_vlseg6: + case Intrinsic::riscv_vlseg7: + case Intrinsic::riscv_vlseg8: + case Intrinsic::riscv_vsseg2: + case Intrinsic::riscv_vsseg3: + case Intrinsic::riscv_vsseg4: + case Intrinsic::riscv_vsseg5: + case Intrinsic::riscv_vsseg6: + case Intrinsic::riscv_vsseg7: + case Intrinsic::riscv_vsseg8: { // Intrinsic interface: // riscv_vle(merge, ptr, vl) // riscv_vle_mask(merge, ptr, mask, vl, policy) // riscv_vse(val, ptr, vl) // riscv_vse_mask(val, ptr, mask, vl, policy) + // riscv_vlseg#(merge, ptr, vl, sew) + // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew) + // riscv_vsseg#(val, ptr, vl, sew) + // riscv_vsseg#_mask(val, ptr, mask, vl, sew) bool IsWrite = Inst->getType()->isVoidTy(); Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + // The results of segment loads are TargetExtType. + if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) { + unsigned SEW = + 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1)) + ->getZExtValue(); + Ty = TarExtTy->getTypeParameter(0U); + Ty = ScalableVectorType::get( + IntegerType::get(C, SEW), + cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW); + } const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); unsigned VLIndex = RVVIInfo->VLOperand; unsigned PtrOperandNo = VLIndex - 1 - HasMask; @@ -2771,23 +2823,72 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, if (HasMask) Mask = Inst->getArgOperand(VLIndex - 1); Value *EVL = Inst->getArgOperand(VLIndex); + unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite); + // RVV uses contiguous elements as a segment. + if (SegNum > 1) { + unsigned ElemSize = Ty->getScalarSizeInBits(); + auto *SegTy = IntegerType::get(C, ElemSize * SegNum); + Ty = VectorType::get(SegTy, cast<VectorType>(Ty)); + } Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, Alignment, Mask, EVL); return true; } case Intrinsic::riscv_vlse_mask: case Intrinsic::riscv_vsse_mask: + case Intrinsic::riscv_vlsseg2_mask: + case Intrinsic::riscv_vlsseg3_mask: + case Intrinsic::riscv_vlsseg4_mask: + case Intrinsic::riscv_vlsseg5_mask: + case Intrinsic::riscv_vlsseg6_mask: + case Intrinsic::riscv_vlsseg7_mask: + case Intrinsic::riscv_vlsseg8_mask: + case Intrinsic::riscv_vssseg2_mask: + case Intrinsic::riscv_vssseg3_mask: + case Intrinsic::riscv_vssseg4_mask: + case Intrinsic::riscv_vssseg5_mask: + case Intrinsic::riscv_vssseg6_mask: + case Intrinsic::riscv_vssseg7_mask: + case Intrinsic::riscv_vssseg8_mask: HasMask = true; [[fallthrough]]; case Intrinsic::riscv_vlse: - case Intrinsic::riscv_vsse: { + case Intrinsic::riscv_vsse: + case Intrinsic::riscv_vlsseg2: + case Intrinsic::riscv_vlsseg3: + case Intrinsic::riscv_vlsseg4: + case Intrinsic::riscv_vlsseg5: + case Intrinsic::riscv_vlsseg6: + case Intrinsic::riscv_vlsseg7: + case Intrinsic::riscv_vlsseg8: + case Intrinsic::riscv_vssseg2: + case Intrinsic::riscv_vssseg3: + case Intrinsic::riscv_vssseg4: + case Intrinsic::riscv_vssseg5: + case Intrinsic::riscv_vssseg6: + case Intrinsic::riscv_vssseg7: + case Intrinsic::riscv_vssseg8: { // Intrinsic interface: // riscv_vlse(merge, ptr, stride, vl) // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy) // riscv_vsse(val, ptr, stride, vl) // riscv_vsse_mask(val, ptr, stride, mask, vl, policy) + // riscv_vlsseg#(merge, ptr, offset, vl, sew) + // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew) + // riscv_vssseg#(val, ptr, offset, vl, sew) + // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew) bool IsWrite = Inst->getType()->isVoidTy(); Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + // The results of segment loads are TargetExtType. + if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) { + unsigned SEW = + 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1)) + ->getZExtValue(); + Ty = TarExtTy->getTypeParameter(0U); + Ty = ScalableVectorType::get( + IntegerType::get(C, SEW), + cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW); + } const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); unsigned VLIndex = RVVIInfo->VLOperand; unsigned PtrOperandNo = VLIndex - 2 - HasMask; @@ -2809,6 +2910,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, if (HasMask) Mask = Inst->getArgOperand(VLIndex - 1); Value *EVL = Inst->getArgOperand(VLIndex); + unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite); + // RVV uses contiguous elements as a segment. + if (SegNum > 1) { + unsigned ElemSize = Ty->getScalarSizeInBits(); + auto *SegTy = IntegerType::get(C, ElemSize * SegNum); + Ty = VectorType::get(SegTy, cast<VectorType>(Ty)); + } Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, Alignment, Mask, EVL, Stride); return true; @@ -2817,19 +2925,89 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::riscv_vluxei_mask: case Intrinsic::riscv_vsoxei_mask: case Intrinsic::riscv_vsuxei_mask: + case Intrinsic::riscv_vloxseg2_mask: + case Intrinsic::riscv_vloxseg3_mask: + case Intrinsic::riscv_vloxseg4_mask: + case Intrinsic::riscv_vloxseg5_mask: + case Intrinsic::riscv_vloxseg6_mask: + case Intrinsic::riscv_vloxseg7_mask: + case Intrinsic::riscv_vloxseg8_mask: + case Intrinsic::riscv_vluxseg2_mask: + case Intrinsic::riscv_vluxseg3_mask: + case Intrinsic::riscv_vluxseg4_mask: + case Intrinsic::riscv_vluxseg5_mask: + case Intrinsic::riscv_vluxseg6_mask: + case Intrinsic::riscv_vluxseg7_mask: + case Intrinsic::riscv_vluxseg8_mask: + case Intrinsic::riscv_vsoxseg2_mask: + case Intrinsic::riscv_vsoxseg3_mask: + case Intrinsic::riscv_vsoxseg4_mask: + case Intrinsic::riscv_vsoxseg5_mask: + case Intrinsic::riscv_vsoxseg6_mask: + case Intrinsic::riscv_vsoxseg7_mask: + case Intrinsic::riscv_vsoxseg8_mask: + case Intrinsic::riscv_vsuxseg2_mask: + case Intrinsic::riscv_vsuxseg3_mask: + case Intrinsic::riscv_vsuxseg4_mask: + case Intrinsic::riscv_vsuxseg5_mask: + case Intrinsic::riscv_vsuxseg6_mask: + case Intrinsic::riscv_vsuxseg7_mask: + case Intrinsic::riscv_vsuxseg8_mask: HasMask = true; [[fallthrough]]; case Intrinsic::riscv_vloxei: case Intrinsic::riscv_vluxei: case Intrinsic::riscv_vsoxei: - case Intrinsic::riscv_vsuxei: { + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vloxseg2: + case Intrinsic::riscv_vloxseg3: + case Intrinsic::riscv_vloxseg4: + case Intrinsic::riscv_vloxseg5: + case Intrinsic::riscv_vloxseg6: + case Intrinsic::riscv_vloxseg7: + case Intrinsic::riscv_vloxseg8: + case Intrinsic::riscv_vluxseg2: + case Intrinsic::riscv_vluxseg3: + case Intrinsic::riscv_vluxseg4: + case Intrinsic::riscv_vluxseg5: + case Intrinsic::riscv_vluxseg6: + case Intrinsic::riscv_vluxseg7: + case Intrinsic::riscv_vluxseg8: + case Intrinsic::riscv_vsoxseg2: + case Intrinsic::riscv_vsoxseg3: + case Intrinsic::riscv_vsoxseg4: + case Intrinsic::riscv_vsoxseg5: + case Intrinsic::riscv_vsoxseg6: + case Intrinsic::riscv_vsoxseg7: + case Intrinsic::riscv_vsoxseg8: + case Intrinsic::riscv_vsuxseg2: + case Intrinsic::riscv_vsuxseg3: + case Intrinsic::riscv_vsuxseg4: + case Intrinsic::riscv_vsuxseg5: + case Intrinsic::riscv_vsuxseg6: + case Intrinsic::riscv_vsuxseg7: + case Intrinsic::riscv_vsuxseg8: { // Intrinsic interface (only listed ordered version): // riscv_vloxei(merge, ptr, index, vl) // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy) // riscv_vsoxei(val, ptr, index, vl) // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy) + // riscv_vloxseg#(merge, ptr, index, vl, sew) + // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew) + // riscv_vsoxseg#(val, ptr, index, vl, sew) + // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew) bool IsWrite = Inst->getType()->isVoidTy(); Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + // The results of segment loads are TargetExtType. + if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) { + unsigned SEW = + 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1)) + ->getZExtValue(); + Ty = TarExtTy->getTypeParameter(0U); + Ty = ScalableVectorType::get( + IntegerType::get(C, SEW), + cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW); + } const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); unsigned VLIndex = RVVIInfo->VLOperand; unsigned PtrOperandNo = VLIndex - 2 - HasMask; @@ -2845,6 +3023,13 @@ bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, Mask = ConstantInt::getTrue(MaskType); } Value *EVL = Inst->getArgOperand(VLIndex); + unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite); + // RVV uses contiguous elements as a segment. + if (SegNum > 1) { + unsigned ElemSize = Ty->getScalarSizeInBits(); + auto *SegTy = IntegerType::get(C, ElemSize * SegNum); + Ty = VectorType::get(SegTy, cast<VectorType>(Ty)); + } Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1); Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, Align(1), Mask, EVL, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 9f2e075..e16c8f0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -2811,9 +2811,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { GetElementPtrInst *NewGEP = simplifyZeroLengthArrayGepInst(Ref); if (NewGEP) { Ref->replaceAllUsesWith(NewGEP); - if (isInstructionTriviallyDead(Ref)) - DeadInsts.insert(Ref); - + DeadInsts.insert(Ref); Ref = NewGEP; } if (Type *GepTy = getGEPType(Ref)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 0afec42..989950f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -307,6 +307,10 @@ private: bool selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectCounterHandleFromBinding(Register &ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const; + bool selectReadImageIntrinsic(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const; bool selectImageWriteIntrinsic(MachineInstr &I) const; @@ -314,6 +318,8 @@ private: MachineInstr &I) const; bool selectModf(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectUpdateCounter(Register &ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; bool selectFrexp(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; // Utilities @@ -3443,6 +3449,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_handlefrombinding: { return selectHandleFromBinding(ResVReg, ResType, I); } + case Intrinsic::spv_resource_counterhandlefrombinding: + return selectCounterHandleFromBinding(ResVReg, ResType, I); + case Intrinsic::spv_resource_updatecounter: + return selectUpdateCounter(ResVReg, ResType, I); case Intrinsic::spv_resource_store_typedbuffer: { return selectImageWriteIntrinsic(I); } @@ -3478,6 +3488,130 @@ bool SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, *cast<GIntrinsic>(&I), I); } +bool SPIRVInstructionSelector::selectCounterHandleFromBinding( + Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { + auto &Intr = cast<GIntrinsic>(I); + assert(Intr.getIntrinsicID() == + Intrinsic::spv_resource_counterhandlefrombinding); + + // Extract information from the intrinsic call. + Register MainHandleReg = Intr.getOperand(2).getReg(); + auto *MainHandleDef = cast<GIntrinsic>(getVRegDef(*MRI, MainHandleReg)); + assert(MainHandleDef->getIntrinsicID() == + Intrinsic::spv_resource_handlefrombinding); + + uint32_t Set = getIConstVal(Intr.getOperand(4).getReg(), MRI); + uint32_t Binding = getIConstVal(Intr.getOperand(3).getReg(), MRI); + uint32_t ArraySize = getIConstVal(MainHandleDef->getOperand(4).getReg(), MRI); + Register IndexReg = MainHandleDef->getOperand(5).getReg(); + const bool IsNonUniform = false; + std::string CounterName = + getStringValueFromReg(MainHandleDef->getOperand(6).getReg(), *MRI) + + ".counter"; + + // Create the counter variable. + MachineIRBuilder MIRBuilder(I); + Register CounterVarReg = buildPointerToResource( + GR.getPointeeType(ResType), GR.getPointerStorageClass(ResType), Set, + Binding, ArraySize, IndexReg, IsNonUniform, CounterName, MIRBuilder); + + return BuildCOPY(ResVReg, CounterVarReg, I); +} + +bool SPIRVInstructionSelector::selectUpdateCounter(Register &ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + auto &Intr = cast<GIntrinsic>(I); + assert(Intr.getIntrinsicID() == Intrinsic::spv_resource_updatecounter); + + Register CounterHandleReg = Intr.getOperand(2).getReg(); + Register IncrReg = Intr.getOperand(3).getReg(); + + // The counter handle is a pointer to the counter variable (which is a struct + // containing an i32). We need to get a pointer to that i32 member to do the + // atomic operation. +#ifndef NDEBUG + SPIRVType *CounterVarType = GR.getSPIRVTypeForVReg(CounterHandleReg); + SPIRVType *CounterVarPointeeType = GR.getPointeeType(CounterVarType); + assert(CounterVarPointeeType && + CounterVarPointeeType->getOpcode() == SPIRV::OpTypeStruct && + "Counter variable must be a struct"); + assert(GR.getPointerStorageClass(CounterVarType) == + SPIRV::StorageClass::StorageBuffer && + "Counter variable must be in the storage buffer storage class"); + assert(CounterVarPointeeType->getNumOperands() == 2 && + "Counter variable must have exactly 1 member in the struct"); + const SPIRVType *MemberType = + GR.getSPIRVTypeForVReg(CounterVarPointeeType->getOperand(1).getReg()); + assert(MemberType->getOpcode() == SPIRV::OpTypeInt && + "Counter variable struct must have a single i32 member"); +#endif + + // The struct has a single i32 member. + MachineIRBuilder MIRBuilder(I); + const Type *LLVMIntType = + Type::getInt32Ty(I.getMF()->getFunction().getContext()); + + SPIRVType *IntPtrType = GR.getOrCreateSPIRVPointerType( + LLVMIntType, MIRBuilder, SPIRV::StorageClass::StorageBuffer); + + auto Zero = buildI32Constant(0, I); + if (!Zero.second) + return false; + + Register PtrToCounter = + MRI->createVirtualRegister(GR.getRegClass(IntPtrType)); + if (!BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpAccessChain)) + .addDef(PtrToCounter) + .addUse(GR.getSPIRVTypeID(IntPtrType)) + .addUse(CounterHandleReg) + .addUse(Zero.first) + .constrainAllUses(TII, TRI, RBI)) { + return false; + } + + // For UAV/SSBO counters, the scope is Device. The counter variable is not + // used as a flag. So the memory semantics can be None. + auto Scope = buildI32Constant(SPIRV::Scope::Device, I); + if (!Scope.second) + return false; + auto Semantics = buildI32Constant(SPIRV::MemorySemantics::None, I); + if (!Semantics.second) + return false; + + int64_t IncrVal = getIConstValSext(IncrReg, MRI); + auto Incr = buildI32Constant(static_cast<uint32_t>(IncrVal), I); + if (!Incr.second) + return false; + + Register AtomicRes = MRI->createVirtualRegister(GR.getRegClass(ResType)); + if (!BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpAtomicIAdd)) + .addDef(AtomicRes) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(PtrToCounter) + .addUse(Scope.first) + .addUse(Semantics.first) + .addUse(Incr.first) + .constrainAllUses(TII, TRI, RBI)) { + return false; + } + if (IncrVal >= 0) { + return BuildCOPY(ResVReg, AtomicRes, I); + } + + // In HLSL, IncrementCounter returns the value *before* the increment, while + // DecrementCounter returns the value *after* the decrement. Both are lowered + // to the same atomic intrinsic which returns the value *before* the + // operation. So for decrements (negative IncrVal), we must subtract the + // increment value from the result to get the post-decrement value. + return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(AtomicRes) + .addUse(Incr.first) + .constrainAllUses(TII, TRI, RBI); +} bool SPIRVInstructionSelector::selectReadImageIntrinsic( Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index 205895e..fc14a03 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -39,6 +39,10 @@ private: void collectBindingInfo(Module &M); uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet); void replaceImplicitBindingCalls(Module &M); + void replaceResourceHandleCall(Module &M, CallInst *OldCI, + uint32_t NewBinding); + void replaceCounterHandleCall(Module &M, CallInst *OldCI, + uint32_t NewBinding); void verifyUniqueOrderIdPerResource(SmallVectorImpl<CallInst *> &Calls); // A map from descriptor set to a bit vector of used binding numbers. @@ -56,64 +60,93 @@ struct BindingInfoCollector : public InstVisitor<BindingInfoCollector> { : UsedBindings(UsedBindings), ImplicitBindingCalls(ImplicitBindingCalls) { } + void addBinding(uint32_t DescSet, uint32_t Binding) { + if (UsedBindings.size() <= DescSet) { + UsedBindings.resize(DescSet + 1); + UsedBindings[DescSet].resize(64); + } + if (UsedBindings[DescSet].size() <= Binding) { + UsedBindings[DescSet].resize(2 * Binding + 1); + } + UsedBindings[DescSet].set(Binding); + } + void visitCallInst(CallInst &CI) { if (CI.getIntrinsicID() == Intrinsic::spv_resource_handlefrombinding) { const uint32_t DescSet = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); const uint32_t Binding = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); - - if (UsedBindings.size() <= DescSet) { - UsedBindings.resize(DescSet + 1); - UsedBindings[DescSet].resize(64); - } - if (UsedBindings[DescSet].size() <= Binding) { - UsedBindings[DescSet].resize(2 * Binding + 1); - } - UsedBindings[DescSet].set(Binding); + addBinding(DescSet, Binding); } else if (CI.getIntrinsicID() == Intrinsic::spv_resource_handlefromimplicitbinding) { ImplicitBindingCalls.push_back(&CI); + } else if (CI.getIntrinsicID() == + Intrinsic::spv_resource_counterhandlefrombinding) { + const uint32_t DescSet = + cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + const uint32_t Binding = + cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); + addBinding(DescSet, Binding); + } else if (CI.getIntrinsicID() == + Intrinsic::spv_resource_counterhandlefromimplicitbinding) { + ImplicitBindingCalls.push_back(&CI); } } }; +static uint32_t getOrderId(const CallInst *CI) { + uint32_t OrderIdArgIdx = 0; + switch (CI->getIntrinsicID()) { + case Intrinsic::spv_resource_handlefromimplicitbinding: + OrderIdArgIdx = 0; + break; + case Intrinsic::spv_resource_counterhandlefromimplicitbinding: + OrderIdArgIdx = 1; + break; + default: + llvm_unreachable("CallInst is not an implicit binding intrinsic"); + } + return cast<ConstantInt>(CI->getArgOperand(OrderIdArgIdx))->getZExtValue(); +} + +static uint32_t getDescSet(const CallInst *CI) { + uint32_t DescSetArgIdx; + switch (CI->getIntrinsicID()) { + case Intrinsic::spv_resource_handlefromimplicitbinding: + case Intrinsic::spv_resource_handlefrombinding: + DescSetArgIdx = 1; + break; + case Intrinsic::spv_resource_counterhandlefromimplicitbinding: + case Intrinsic::spv_resource_counterhandlefrombinding: + DescSetArgIdx = 2; + break; + default: + llvm_unreachable("CallInst is not an implicit binding intrinsic"); + } + return cast<ConstantInt>(CI->getArgOperand(DescSetArgIdx))->getZExtValue(); +} + void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) { BindingInfoCollector InfoCollector(UsedBindings, ImplicitBindingCalls); InfoCollector.visit(M); // Sort the collected calls by their order ID. - std::sort( - ImplicitBindingCalls.begin(), ImplicitBindingCalls.end(), - [](const CallInst *A, const CallInst *B) { - const uint32_t OrderIdArgIdx = 0; - const uint32_t OrderA = - cast<ConstantInt>(A->getArgOperand(OrderIdArgIdx))->getZExtValue(); - const uint32_t OrderB = - cast<ConstantInt>(B->getArgOperand(OrderIdArgIdx))->getZExtValue(); - return OrderA < OrderB; - }); + std::sort(ImplicitBindingCalls.begin(), ImplicitBindingCalls.end(), + [](const CallInst *A, const CallInst *B) { + return getOrderId(A) < getOrderId(B); + }); } void SPIRVLegalizeImplicitBinding::verifyUniqueOrderIdPerResource( SmallVectorImpl<CallInst *> &Calls) { // Check that the order Id is unique per resource. for (uint32_t i = 1; i < Calls.size(); ++i) { - const uint32_t OrderIdArgIdx = 0; - const uint32_t DescSetArgIdx = 1; - const uint32_t OrderA = - cast<ConstantInt>(Calls[i - 1]->getArgOperand(OrderIdArgIdx)) - ->getZExtValue(); - const uint32_t OrderB = - cast<ConstantInt>(Calls[i]->getArgOperand(OrderIdArgIdx)) - ->getZExtValue(); + const uint32_t OrderA = getOrderId(Calls[i - 1]); + const uint32_t OrderB = getOrderId(Calls[i]); if (OrderA == OrderB) { - const uint32_t DescSetA = - cast<ConstantInt>(Calls[i - 1]->getArgOperand(DescSetArgIdx)) - ->getZExtValue(); - const uint32_t DescSetB = - cast<ConstantInt>(Calls[i]->getArgOperand(DescSetArgIdx)) - ->getZExtValue(); + const uint32_t DescSetA = getDescSet(Calls[i - 1]); + const uint32_t DescSetB = getDescSet(Calls[i]); if (DescSetA != DescSetB) { report_fatal_error("Implicit binding calls with the same order ID must " "have the same descriptor set"); @@ -144,36 +177,26 @@ void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) { uint32_t lastBindingNumber = -1; for (CallInst *OldCI : ImplicitBindingCalls) { - IRBuilder<> Builder(OldCI); - const uint32_t OrderId = - cast<ConstantInt>(OldCI->getArgOperand(0))->getZExtValue(); - const uint32_t DescSet = - cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue(); - - // Reuse an existing binding for this order ID, if one was already assigned. - // Otherwise, assign a new binding. - const uint32_t NewBinding = (lastOrderId == OrderId) - ? lastBindingNumber - : getAndReserveFirstUnusedBinding(DescSet); - lastOrderId = OrderId; - lastBindingNumber = NewBinding; - - SmallVector<Value *, 8> Args; - Args.push_back(Builder.getInt32(DescSet)); - Args.push_back(Builder.getInt32(NewBinding)); - - // Copy the remaining arguments from the old call. - for (uint32_t i = 2; i < OldCI->arg_size(); ++i) { - Args.push_back(OldCI->getArgOperand(i)); + const uint32_t OrderId = getOrderId(OldCI); + uint32_t BindingNumber; + if (OrderId == lastOrderId) { + BindingNumber = lastBindingNumber; + } else { + const uint32_t DescSet = getDescSet(OldCI); + BindingNumber = getAndReserveFirstUnusedBinding(DescSet); } - Function *NewFunc = Intrinsic::getOrInsertDeclaration( - &M, Intrinsic::spv_resource_handlefrombinding, OldCI->getType()); - CallInst *NewCI = Builder.CreateCall(NewFunc, Args); - NewCI->setCallingConv(OldCI->getCallingConv()); - - OldCI->replaceAllUsesWith(NewCI); - OldCI->eraseFromParent(); + if (OldCI->getIntrinsicID() == + Intrinsic::spv_resource_handlefromimplicitbinding) { + replaceResourceHandleCall(M, OldCI, BindingNumber); + } else { + assert(OldCI->getIntrinsicID() == + Intrinsic::spv_resource_counterhandlefromimplicitbinding && + "Unexpected implicit binding intrinsic"); + replaceCounterHandleCall(M, OldCI, BindingNumber); + } + lastOrderId = OrderId; + lastBindingNumber = BindingNumber; } } @@ -196,4 +219,49 @@ INITIALIZE_PASS(SPIRVLegalizeImplicitBinding, "legalize-spirv-implicit-binding", ModulePass *llvm::createSPIRVLegalizeImplicitBindingPass() { return new SPIRVLegalizeImplicitBinding(); -}
\ No newline at end of file +} + +void SPIRVLegalizeImplicitBinding::replaceResourceHandleCall( + Module &M, CallInst *OldCI, uint32_t NewBinding) { + IRBuilder<> Builder(OldCI); + const uint32_t DescSet = + cast<ConstantInt>(OldCI->getArgOperand(1))->getZExtValue(); + + SmallVector<Value *, 8> Args; + Args.push_back(Builder.getInt32(DescSet)); + Args.push_back(Builder.getInt32(NewBinding)); + + // Copy the remaining arguments from the old call. + for (uint32_t i = 2; i < OldCI->arg_size(); ++i) { + Args.push_back(OldCI->getArgOperand(i)); + } + + Function *NewFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::spv_resource_handlefrombinding, OldCI->getType()); + CallInst *NewCI = Builder.CreateCall(NewFunc, Args); + NewCI->setCallingConv(OldCI->getCallingConv()); + + OldCI->replaceAllUsesWith(NewCI); + OldCI->eraseFromParent(); +} + +void SPIRVLegalizeImplicitBinding::replaceCounterHandleCall( + Module &M, CallInst *OldCI, uint32_t NewBinding) { + IRBuilder<> Builder(OldCI); + const uint32_t DescSet = + cast<ConstantInt>(OldCI->getArgOperand(2))->getZExtValue(); + + SmallVector<Value *, 8> Args; + Args.push_back(OldCI->getArgOperand(0)); + Args.push_back(Builder.getInt32(NewBinding)); + Args.push_back(Builder.getInt32(DescSet)); + + Type *Tys[] = {OldCI->getType(), OldCI->getArgOperand(0)->getType()}; + Function *NewFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::spv_resource_counterhandlefrombinding, Tys); + CallInst *NewCI = Builder.CreateCall(NewFunc, Args); + NewCI->setCallingConv(OldCI->getCallingConv()); + + OldCI->replaceAllUsesWith(NewCI); + OldCI->eraseFromParent(); +} diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 327c011..1d47c89 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -385,6 +385,12 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) { return MI->getOperand(1).getCImm()->getValue().getZExtValue(); } +int64_t getIConstValSext(Register ConstReg, const MachineRegisterInfo *MRI) { + const MachineInstr *MI = getDefInstrMaybeConstant(ConstReg, MRI); + assert(MI && MI->getOpcode() == TargetOpcode::G_CONSTANT); + return MI->getOperand(1).getCImm()->getSExtValue(); +} + bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID) { if (const auto *GI = dyn_cast<GIntrinsic>(&MI)) return GI->is(IntrinsicID); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index 409a0fd..5777a24 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -289,6 +289,9 @@ MachineInstr *getDefInstrMaybeConstant(Register &ConstReg, // Get constant integer value of the given ConstReg. uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI); +// Get constant integer value of the given ConstReg, sign-extended. +int64_t getIConstValSext(Register ConstReg, const MachineRegisterInfo *MRI); + // Check if MI is a SPIR-V specific intrinsic call. bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID); // Check if it's a SPIR-V specific intrinsic call. diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 3090ad3..27fba34 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -407,6 +407,7 @@ bool X86InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_TRUNC: return selectTruncOrPtrToInt(I, MRI, MF); case TargetOpcode::G_INTTOPTR: + case TargetOpcode::G_FREEZE: return selectCopy(I, MRI); case TargetOpcode::G_ZEXT: return selectZext(I, MRI, MF); diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index e7709ef..11ef721 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -89,9 +89,29 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, // 32/64-bits needs support for s64/s128 to handle cases: // s64 = EXTEND (G_IMPLICIT_DEF s32) -> s64 = G_IMPLICIT_DEF // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF - getActionDefinitionsBuilder(G_IMPLICIT_DEF) + getActionDefinitionsBuilder( + {G_IMPLICIT_DEF, G_PHI, G_FREEZE, G_CONSTANT_FOLD_BARRIER}) .legalFor({p0, s1, s8, s16, s32, s64}) - .legalFor(Is64Bit, {s128}); + .legalFor(UseX87, {s80}) + .legalFor(Is64Bit, {s128}) + .legalFor(HasSSE2, {v16s8, v8s16, v4s32, v2s64}) + .legalFor(HasAVX, {v32s8, v16s16, v8s32, v4s64}) + .legalFor(HasAVX512, {v64s8, v32s16, v16s32, v8s64}) + .widenScalarOrEltToNextPow2(0, /*Min=*/8) + .clampScalarOrElt(0, s8, sMaxScalar) + .moreElementsToNextPow2(0) + .clampMinNumElements(0, s8, 16) + .clampMinNumElements(0, s16, 8) + .clampMinNumElements(0, s32, 4) + .clampMinNumElements(0, s64, 2) + .clampMaxNumElements(0, s8, HasAVX512 ? 64 : (HasAVX ? 32 : 16)) + .clampMaxNumElements(0, s16, HasAVX512 ? 32 : (HasAVX ? 16 : 8)) + .clampMaxNumElements(0, s32, HasAVX512 ? 16 : (HasAVX ? 8 : 4)) + .clampMaxNumElements(0, s64, HasAVX512 ? 8 : (HasAVX ? 4 : 2)) + .clampMaxNumElements(0, p0, + Is64Bit ? s64MaxVector.getNumElements() + : s32MaxVector.getNumElements()) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); getActionDefinitionsBuilder(G_CONSTANT) .legalFor({p0, s8, s16, s32}) @@ -289,26 +309,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .clampScalar(1, s16, sMaxScalar) .scalarSameSizeAs(0, 1); - // control flow - getActionDefinitionsBuilder(G_PHI) - .legalFor({s8, s16, s32, p0}) - .legalFor(UseX87, {s80}) - .legalFor(Is64Bit, {s64}) - .legalFor(HasSSE1, {v16s8, v8s16, v4s32, v2s64}) - .legalFor(HasAVX, {v32s8, v16s16, v8s32, v4s64}) - .legalFor(HasAVX512, {v64s8, v32s16, v16s32, v8s64}) - .clampMinNumElements(0, s8, 16) - .clampMinNumElements(0, s16, 8) - .clampMinNumElements(0, s32, 4) - .clampMinNumElements(0, s64, 2) - .clampMaxNumElements(0, s8, HasAVX512 ? 64 : (HasAVX ? 32 : 16)) - .clampMaxNumElements(0, s16, HasAVX512 ? 32 : (HasAVX ? 16 : 8)) - .clampMaxNumElements(0, s32, HasAVX512 ? 16 : (HasAVX ? 8 : 4)) - .clampMaxNumElements(0, s64, HasAVX512 ? 8 : (HasAVX ? 4 : 2)) - .widenScalarToNextPow2(0, /*Min=*/32) - .clampScalar(0, s8, sMaxScalar) - .scalarize(0); - getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); // pointer handling @@ -592,11 +592,6 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .minScalar(0, LLT::scalar(32)) .libcall(); - getActionDefinitionsBuilder({G_FREEZE, G_CONSTANT_FOLD_BARRIER}) - .legalFor({s8, s16, s32, s64, p0}) - .widenScalarToNextPow2(0, /*Min=*/8) - .clampScalar(0, s8, sMaxScalar); - getLegacyLegalizerInfo().computeTables(); verify(*STI.getInstrInfo()); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 564810c..83bd6ac 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -662,6 +662,7 @@ def VINSERTPSZrri : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>; +let mayLoad = 1 in def VINSERTPSZrmi : AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -1293,6 +1294,7 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + let hasSideEffects = 0, mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", (_Dst.VT (OpNode addr:$src))>, @@ -1748,6 +1750,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>, EVEX, VVVV, AVX5128IBase, Sched<[sched]>; + let hasSideEffects = 0, mayLoad = 1 in defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", @@ -1759,7 +1762,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo IdxVT> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, mayLoad = 1 in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), @@ -1987,6 +1990,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, _.FRC:$src2, timm:$cc))]>, EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in def rmi : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -2145,6 +2149,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, (_.VT _.RC:$src2), cond)))]>, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in def rmi : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, @@ -2167,6 +2172,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, (_.VT _.RC:$src2), cond))))]>, EVEX, VVVV, EVEX_K, Sched<[sched]>; + let mayLoad = 1 in def rmik : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, u8imm:$cc), @@ -2198,6 +2204,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, PatFrag Frag_su, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> : avx512_icmp_cc<opc, Suffix, Frag, Frag_su, sched, _, Name> { + let mayLoad = 1 in { def rmbi : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -2221,6 +2228,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, (_.BroadcastLdFrag addr:$src2), cond))))]>, EVEX, VVVV, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } def : Pat<(_.KVT (Frag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), @@ -2305,6 +2313,7 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 1>, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, @@ -2329,6 +2338,7 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { timm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } + } // Patterns for selecting with loads in other operand. def : Pat<(X86any_cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), @@ -3771,6 +3781,7 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))]>, EVEX, Sched<[WriteVecMoveFromGpr]>; +let mayLoad = 1 in def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, @@ -3874,7 +3885,7 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), // Move Quadword Int to Packed Quadword Int // -let ExeDomain = SSEPackedInt in { +let ExeDomain = SSEPackedInt, mayLoad = 1, hasSideEffects = 0 in { def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -3930,13 +3941,13 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag, (_.VT (OpNode _.RC:$src1, _.RC:$src2)), (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, VVVV, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>; - let canFoldAsLoad = 1, isReMaterializable = 1 in { + let canFoldAsLoad = 1, isReMaterializable = 1, mayLoad = 1, hasSideEffects = 0 in { def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))], _.ExeDomain>, EVEX, Sched<[WriteFLoad]>; // _alt version uses FR32/FR64 register class. - let isCodeGenOnly = 1 in + let isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in def rm_alt : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], @@ -4557,6 +4568,7 @@ let Predicates = [HasAVX512] in { // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// +let mayLoad = 1, hasSideEffects = 0 in { def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [], SSEPackedInt>, Sched<[SchedWriteVecMoveLSNT.ZMM.RM]>, @@ -4575,11 +4587,12 @@ let Predicates = [HasVLX] in { [], SSEPackedInt>, Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, EVEX, T8, PD, EVEX_V128, EVEX_CD8<64, CD8VF>; } +} multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86SchedWriteMoveLS Sched, PatFrag st_frag = alignednontemporalstore> { - let SchedRW = [Sched.MR], AddedComplexity = 400 in + let mayStore = 1, SchedRW = [Sched.MR], AddedComplexity = 400 in def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(st_frag (_.VT _.RC:$src), addr:$dst)], @@ -4682,6 +4695,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, IsCommutable, IsCommutable>, AVX512BIBase, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -4694,6 +4708,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, bit IsCommutable = 0> : avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> { + let mayLoad = 1, hasSideEffects = 0 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"#_.BroadcastStr#", $src1", @@ -4811,6 +4826,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, (_Src.VT _Src.RC:$src2))), IsCommutable>, AVX512BIBase, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -4828,6 +4844,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, @@ -4893,6 +4910,7 @@ defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU, multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst, X86FoldableSchedWrite sched> { + let mayLoad = 1, hasSideEffects = 0 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, @@ -4916,6 +4934,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, (_Src.VT _Src.RC:$src2))), IsCommutable, IsCommutable>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -5370,6 +5389,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (_.VT (VecNode _.RC:$src1, _.RC:$src2)), "_Int">, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -5384,6 +5404,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, Sched<[sched]> { let isCommutable = IsCommutable; } + let mayLoad = 1 in def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5414,6 +5435,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (_.VT (VecNode _.RC:$src1, _.RC:$src2)), "_Int">, Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -5430,6 +5452,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, Sched<[sched]> { let isCommutable = IsCommutable; } + let mayLoad = 1 in def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5509,6 +5532,7 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, Sched<[sched]> { let isCommutable = 1; } + let mayLoad = 1 in def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5737,6 +5761,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in { defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", @@ -5749,6 +5774,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } + } } multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5759,6 +5785,7 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, Sched<[sched]>; + let mayLoad = 1 in defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", @@ -5916,6 +5943,7 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>, Sched<[sched]>; + let mayLoad = 1 in defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -5928,7 +5956,7 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, mayLoad = 1 in defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, ${src1}"#_.BroadcastStr, "${src1}"#_.BroadcastStr#", $src2", @@ -5946,6 +5974,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>, AVX512BIBase, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -6095,6 +6124,7 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>, AVX5128IBase, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -6107,7 +6137,7 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"#_.BroadcastStr#", $src1", @@ -6372,6 +6402,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (Ctrl.VT Ctrl.RC:$src2)))>, T8, PD, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in { defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -6389,6 +6420,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, T8, PD, EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, @@ -7258,6 +7290,7 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses, (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>, EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>; + let mayLoad = 1 in def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, x86memop:$src2), asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -7400,6 +7433,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>, EVEX, VEX_LIG, EVEX_B, EVEX_RC, Sched<[sched]>; + let mayLoad = 1 in def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode @@ -7451,6 +7485,7 @@ multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT, !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode SrcVT.FRC:$src))]>, EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in def rm : AVX512<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode (SrcVT.ScalarLdFrag addr:$src)))]>, @@ -7572,6 +7607,7 @@ let Predicates = [prd], ExeDomain = _SrcRC.ExeDomain in { !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, @@ -7587,6 +7623,7 @@ let Predicates = [prd], ExeDomain = _SrcRC.ExeDomain in { !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>, EVEX, VEX_LIG, EVEX_B, Sched<[sched]>; + let mayLoad = 1 in def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), @@ -7644,6 +7681,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ (_.VT (OpNode (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2))), "_Int">, EVEX, VVVV, VEX_LIG, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -7807,6 +7845,7 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { _.ImmAllZerosV)>, EVEX, Sched<[sched]>; + let mayLoad = 1 in { defm rm : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins MemOp:$src), (ins _.RC:$src0, MaskRC:$mask, MemOp:$src), @@ -7840,6 +7879,7 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { _.ImmAllZerosV)>, EVEX, EVEX_B, Sched<[sched.Folded]>; } + } } // Conversion with SAE - suppress all exceptions multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -8944,6 +8984,7 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, (X86any_cvtph2ps (_src.VT _src.RC:$src)), (X86cvtph2ps (_src.VT _src.RC:$src))>, T8, PD, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), "vcvtph2ps", "$src", "$src", (X86any_cvtph2ps (_src.VT ld_dag)), @@ -9161,6 +9202,7 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX, VVVV, VEX_LIG, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -9621,6 +9663,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, (i32 timm:$src3))), "_Int">, EVEX_B, Sched<[sched]>; + let mayLoad = 1 in defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, @@ -9999,6 +10042,7 @@ multiclass avx512_pmovx_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWr (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, EVEX, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), (ins x86memop:$src), OpcodeStr ,"$src", "$src", (DestInfo.VT (LdFrag addr:$src))>, @@ -10601,6 +10645,7 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, (null_frag)>, AVX5128IBase, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", (null_frag)>, @@ -10673,6 +10718,7 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, (OpNode (_.VT _.RC:$src1), (i32 timm:$src2)), (MaskOpNode (_.VT _.RC:$src1), (i32 timm:$src2))>, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", @@ -10691,6 +10737,7 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } + } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} @@ -10739,6 +10786,7 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (i32 timm:$src3))>, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10755,6 +10803,7 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, (i32 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } + } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) @@ -10770,6 +10819,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (SrcInfo.VT SrcInfo.RC:$src2), (i8 timm:$src3)))>, Sched<[sched]>; + let mayLoad = 1 in defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10788,7 +10838,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _>: avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{ - let ExeDomain = _.ExeDomain, ImmT = Imm8 in + let ExeDomain = _.ExeDomain, ImmT = Imm8, mayLoad = 1 in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", @@ -10811,6 +10861,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (i32 timm:$src3))>, Sched<[sched]>; + let mayLoad = 1 in defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10979,6 +11030,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))))>, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -11000,6 +11052,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (i8 timm:$src3)))))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } + } } multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched, @@ -11031,6 +11084,7 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -11048,6 +11102,7 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } + } } multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched, @@ -11202,6 +11257,7 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", @@ -11214,6 +11270,7 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> : avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> { + let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"#_.BroadcastStr, @@ -11368,6 +11425,7 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, (ins _.RC:$src), OpcodeStr, "$src", "$src", (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", (_.VT (_.BroadcastLdFrag addr:$src))>, @@ -11513,6 +11571,7 @@ defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, REX_W; multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, PatFrag LdFrag, SDPatternOperator immoperator> { + let mayLoad = 1 in def rmi : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -11650,6 +11709,7 @@ multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, (OpNode (_src.VT _src.RC:$src1), (_src.VT _src.RC:$src2))))]>, Sched<[sched]>; + let mayLoad = 1 in def rm : AVX512BI<opc, MRMSrcMem, (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -11751,6 +11811,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src3), (i8 timm:$src4)), 1, 1>, AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -11770,6 +11831,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (i8 timm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; + } }// Constraints = "$src1 = $dst" // Additional patterns for matching passthru operand in other positions. @@ -12016,6 +12078,7 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), (i32 timm:$src4))>, Sched<[sched]>; + let mayLoad = 1 in { defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -12033,6 +12096,7 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } // Constraints = "$src1 = $dst" } @@ -12075,6 +12139,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (_src3VT.VT _src3VT.RC:$src3), (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + let mayLoad = 1 in defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -12417,6 +12482,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, VTI.RC:$src2, VTI.RC:$src3)), IsCommutable, IsCommutable>, EVEX, VVVV, T8, Sched<[sched]>; + let mayLoad = 1 in { defm rm : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", @@ -12435,6 +12501,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, T8, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } + } } multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, @@ -12508,6 +12575,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2))>, EVEX, VVVV, T8, PD, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst), (ins VTI.RC:$src1, VTI.MemOp:$src2), "vpshufbitqmb", @@ -12557,7 +12625,7 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo VTI, X86VectorVTInfo BcstVTI> : avx512_3Op_rm_imm8<Op, OpStr, OpNode, sched, VTI, VTI> { - let ExeDomain = VTI.ExeDomain in + let ExeDomain = VTI.ExeDomain, mayLoad = 1 in defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src1, BcstVTI.ScalarMemOp:$src2, u8imm:$src3), OpStr, "$src3, ${src2}"#BcstVTI.BroadcastStr#", $src1", @@ -12660,6 +12728,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf _.RC:$src1, (_.VT _.RC:$src2)))]>, EVEX, VVVV, T8, XD, Sched<[sched]>; + let mayLoad = 1 in { def rm : I<0x68, MRMSrcMem, (outs _.KRPC:$dst), (ins _.RC:$src1, _.MemOp:$src2), @@ -12679,6 +12748,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, EVEX, VVVV, T8, XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } multiclass avx512_vp2intersect<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { @@ -12882,6 +12952,7 @@ let Predicates = [HasFP16] in { // Move word ( r/m16) to Packed word def VMOVW2SHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, Sched<[WriteVecMoveFromGpr]>; +let mayLoad = 1 in def VMOVWrm : AVX512<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i16mem:$src), "vmovw\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, @@ -13607,6 +13678,7 @@ multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNod (v4f32 (OpNode VR128X:$src1, VR128X:$src2)), IsCommutable, IsCommutable, IsCommutable, X86selects, "@earlyclobber $dst">, Sched<[WriteFMAX]>; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src1, ssmem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", |