diff options
Diffstat (limited to 'llvm/lib/Target')
28 files changed, 911 insertions, 93 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6965116..9926a4d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26196,9 +26196,10 @@ static SDValue performFlagSettingCombine(SDNode *N, return DCI.CombineTo(N, Res, SDValue(N, 1)); } - // Combine identical generic nodes into this node, re-using the result. + // Combine equivalent generic nodes into this node, re-using the result. if (SDNode *Generic = DCI.DAG.getNodeIfExists( - GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) + GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}, + /*AllowCommute=*/true)) DCI.CombineTo(Generic, SDValue(N, 0)); return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index dbe74b1..5700468 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || TII->isTRANS(MI))) - Result = true; + Result = !MI.mayLoadOrStore(); else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && - TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) - Result = true; + TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) { + // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS). + // For our purposes, these shall not be classified as VALU as this results + // in unexpected behavior. + Result = !MI.mayLoadOrStore(); + } else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && TII->isSALU(MI)) - Result = true; + Result = !MI.mayLoadOrStore(); else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && TII->isMFMAorWMMA(MI)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a44af5f..1b559a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, R = getMad(DAG, DL, VT, YH, CH, Mad1); } - const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && - (Flags.hasNoInfs() || Options.NoInfsFPMath); + const bool IsFiniteOnly = + (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs(); // TODO: Check if known finite from source value. if (!IsFiniteOnly) { @@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); - const auto &Options = getTargetMachine().Options; - if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { + if (!Flags.hasNoInfs()) { SDValue OverflowCheckConst = DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); SDValue Overflow = diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index ee466ca..596a895 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, const bool IsFiniteOnly = (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && - (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); + MI.getFlag(MachineInstr::FmNoInfs); if (!IsFiniteOnly) { // Expand isfinite(x) => fabs(x) < inf @@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, R = B.buildSelect(Ty, Underflow, Zero, R); - const auto &Options = MF.getTarget().Options; - - if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { + if (!(Flags & MachineInstr::FmNoInfs)) { auto OverflowCheckConst = B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 71494be..4e11c4f 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -14,6 +14,7 @@ #include "GCNRegPressure.h" #include "AMDGPU.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterPressure.h" using namespace llvm; @@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind) { GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = Register::index2VirtReg(I); + if (RegKind != GCNRegPressure::TOTAL_KINDS && + GCNRegPressure::getRegKind(Reg, MRI) != RegKind) + continue; if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); @@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { #undef PFX } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI) { + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + auto &OS = dbgs(); + const char *RegName = GCNRegPressure::getName(Kind); + + unsigned MaxNumRegs = 0; + const MachineInstr *MaxPressureMI = nullptr; + GCNUpwardRPTracker RPT(LIS); + for (const MachineBasicBlock &MBB : MF) { + RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot()); + for (const MachineInstr &MI : reverse(MBB)) { + RPT.recede(MI); + unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind); + if (NumRegs > MaxNumRegs) { + MaxNumRegs = NumRegs; + MaxPressureMI = &MI; + } + } + } + + SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI); + + // Max pressure can occur at either the early-clobber or register slot. + // Choose the maximum liveset between both slots. This is ugly but this is + // diagnostic code. + SlotIndex ECSlot = MISlot.getRegSlot(true); + SlotIndex RSlot = MISlot.getRegSlot(false); + GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind); + GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind); + unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind); + unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind); + GCNRPTracker::LiveRegSet *LiveSet = + ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet; + SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot; + assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs); + + // Split live registers into single-def and multi-def sets. + GCNRegPressure SDefPressure, MDefPressure; + SmallVector<Register, 16> SDefRegs, MDefRegs; + for (auto [Reg, LaneMask] : *LiveSet) { + assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind); + LiveInterval &LI = LIS.getInterval(Reg); + if (LI.getNumValNums() == 1 || + (LI.hasSubRanges() && + llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) { + return SR.getNumValNums() == 1; + }))) { + SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + SDefRegs.push_back(Reg); + } else { + MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI); + MDefRegs.push_back(Reg); + } + } + unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind); + unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind); + assert(SDefNumRegs + MDefNumRegs == MaxNumRegs); + + auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) { + return Printable([&, MBB, SI](raw_ostream &OS) { + OS << SI << ':' << printMBBReference(*MBB); + if (MLI) + if (const MachineLoop *ML = MLI->getLoopFor(MBB)) + OS << " (LoopHdr " << printMBBReference(*ML->getHeader()) + << ", Depth " << ML->getLoopDepth() << ")"; + }); + }; + + auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) { + GCNRegPressure RegPressure; + RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI); + OS << " " << printReg(Reg, TRI) << ':' + << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask " + << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' ' + << RegName << "s)\n"; + + // Use std::map to sort def/uses by SlotIndex. + std::map<SlotIndex, const MachineInstr *> Instrs; + for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) { + Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI; + } + + for (const auto &[SI, MI] : Instrs) { + OS << " "; + if (MI->definesRegister(Reg, TRI)) + OS << "def "; + if (MI->readsRegister(Reg, TRI)) + OS << "use "; + OS << printLoc(MI->getParent(), SI) << ": " << *MI; + } + }; + + OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName() + << " ***\n"; + OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at " + << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": " + << *MaxPressureMI; + + OS << "\nLive registers with single definition (" << SDefNumRegs << ' ' + << RegName << "s):\n"; + + // Sort SDefRegs by number of uses (smallest first) + llvm::sort(SDefRegs, [&](Register A, Register B) { + return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) < + std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end()); + }); + + for (const Register Reg : SDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } + + OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' ' + << RegName << "s):\n"; + for (const Register Reg : MDefRegs) { + PrintRegInfo(Reg, LiveSet->lookup(Reg)); + } +} +#endif diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 898d1ff..979a8b0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -31,6 +31,12 @@ class SlotIndex; struct GCNRegPressure { enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS }; + static constexpr const char *getName(RegKind Kind) { + const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"}; + assert(Kind < TOTAL_KINDS); + return Names[Kind]; + } + GCNRegPressure() { clear(); } @@ -41,6 +47,11 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + unsigned getNumRegs(RegKind Kind) const { + assert(Kind < TOTAL_KINDS); + return Value[Kind]; + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR]; } /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure @@ -138,6 +149,12 @@ struct GCNRegPressure { void dump() const; + static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI); + return (RegKind)getRegKind(MRI.getRegClass(Reg), STI); + } + private: static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2; @@ -294,8 +311,10 @@ public: } }; -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); +GCNRPTracker::LiveRegSet +getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS); //////////////////////////////////////////////////////////////////////////////// // GCNUpwardRPTracker @@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, const MachineRegisterInfo &MRI, LaneBitmask LaneMaskFilter = LaneBitmask::getAll()); -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI); - /// creates a map MachineInstr -> LiveRegSet /// R - range of iterators on instructions /// After - upon entry or exit of every instruction @@ -524,6 +540,11 @@ public: } }; +LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, + GCNRegPressure::RegKind Kind, + LiveIntervals &LIS, + const MachineLoopInfo *MLI); + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index bdc0810..58482ea 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers( cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false)); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +#define DUMP_MAX_REG_PRESSURE +static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( + "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure before scheduling."), + cl::init(false)); + +static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( + "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, + cl::desc("Print a list of live registers along with their def/uses at the " + "point of maximum register pressure after scheduling."), + cl::init(false)); +#endif + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() { RegionLiveOuts.buildLiveRegMap(); } +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageBeforeScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif + GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); while (S.advanceStage()) { auto Stage = createSchedStage(S.getCurrentStage()); @@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() { Stage->finalizeGCNSchedStage(); } + +#ifdef DUMP_MAX_REG_PRESSURE + if (PrintMaxRPRegUsageAfterScheduler) { + dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); + dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); + LIS->dump(); + } +#endif } #ifndef NDEBUG diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 64e34db..5f6d742 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -260,8 +260,12 @@ class NSAHelper { } class MIMGNSAHelper<int num_addrs, - list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> - : NSAHelper<> { + list<RegisterOperand> addr_types_in=[]> + : NSAHelper<> { + list<RegisterOperand> addr_types = + !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs), + addr_types_in); + list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i); let AddrIns = !dag(ins, addr_types, AddrAsmNames); let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; @@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = ""> // Base class for all NSA MIMG instructions. // Note that 1-dword addresses always use non-NSA variants. class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", - list<RegisterClass> addr_types=[], + list<RegisterOperand> addr_types=[], RegisterOperand LastAddrRC = VGPROp_32> : MIMG<outs, dns>, MIMGe_gfx11<op> { let SubtargetPredicate = isGFX11Only; @@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", } class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="", - list<RegisterClass> addr_types=[]> + list<RegisterOperand> addr_types=[]> : VIMAGE<outs, dns>, VIMAGEe<op> { let SubtargetPredicate = isGFX12Plus; let AssemblerPredicate = isGFX12Plus; @@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> { int VAddrDwords = !srl(Size, 5); int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); - RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); - list<RegisterClass> GFX11PlusAddrTypes = - !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32], - isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64], - IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96], - true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); + RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32); + list<RegisterOperand> GFX11PlusAddrTypes = + !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32], + isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64], + IsA16 : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96], + true : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]); } class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC> @@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC> } class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, - list<RegisterClass> addr_types> + list<RegisterOperand> addr_types> : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11", addr_types> { let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16)); @@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs, bit isDual, bit isBVH8, - list<RegisterClass> addr_types> + list<RegisterOperand> addr_types> : VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8), (outs VReg_320:$vdata, VReg_96:$ray_origin_out, VReg_96:$ray_dir_out), diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ec5c5bb3..a44a247 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -899,7 +899,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (DestReg == AMDGPU::VCC) { - if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + if (AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) .addReg(SrcReg, getKillRegState(KillSrc)); } else { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index bef4868..7e7ee75 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -280,6 +280,10 @@ static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) { } void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) { + if (!Subtarget->hasTcgen05InstSupport()) + report_fatal_error( + "tcgen05.ld is not supported on this architecture variant"); + SDLoc DL(N); unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); @@ -2136,6 +2140,10 @@ static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) { } void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) { + if (!Subtarget->hasTcgen05InstSupport()) + report_fatal_error( + "tcgen05.st is not supported on this architecture variant"); + SDLoc DL(N); unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 6c14cf0..dfde0cc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -101,6 +101,22 @@ def PrmtMode : Operand<i32> { // NVPTX Instruction Predicate Definitions //===----------------------------------------------------------------------===// +// Checks PTX version and family-specific and architecture-specific SM versions. +// For example, sm_100{f/a} and any future variants in the same family will match +// for any PTX version greater than or equal to `PTXVersion`. +class PTXWithFamilySMs<int PTXVersion, list<int> SMVersions> : + Predicate<"Subtarget->hasPTXWithFamilySMs(" # PTXVersion # ", {" # + !interleave(SMVersions, ", ") # "})">; + +// Checks PTX version and architecture-specific SM versions. +// For example, sm_100{a} will match for any PTX version +// greater than or equal to `PTXVersion`. +class PTXWithAccelSMs<int PTXVersion, list<int> SMVersions> : + Predicate<"Subtarget->hasPTXWithAccelSMs(" # PTXVersion # ", {" # + !interleave(SMVersions, ", ") # "})">; + +// Helper predicate to call a subtarget method. +class callSubtarget<string SubtargetMethod> : Predicate<"Subtarget->" # SubtargetMethod # "()">; def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a8b854f..22cf3a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -5103,8 +5103,8 @@ let Predicates = [hasSM<90>, hasPTX<78>] in { def EXIT : NullaryInst<"exit", int_nvvm_exit>; // Tcgen05 intrinsics -let isConvergent = true, Predicates = [hasTcgen05Instructions] in { - +let isConvergent = true in { +let Predicates = [callSubtarget<"hasTcgen05InstSupport">] in { multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> { def "" : BasicNVPTXInst<(outs), (ins ADDR:$dst, B32:$ncols), @@ -5156,15 +5156,6 @@ defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">; defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">; defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">; -multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { - def "" : BasicNVPTXInst<(outs), - (ins ADDR:$tmem_addr), - "tcgen05.shift.cta_group::" # num # ".down", - [(Intr addr:$tmem_addr)]>; -} -defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; -defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; - multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> { defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16"); defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret; @@ -5195,9 +5186,22 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">; defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">; } +} // Predicates + +let Predicates = [callSubtarget<"hasTcgen05ShiftSupport">] in { +multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> { + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr), + "tcgen05.shift.cta_group::" # num # ".down", + [(Intr addr:$tmem_addr)]>; +} +defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; +defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; +} // Predicates + } // isConvergent -let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in { +let hasSideEffects = 1, Predicates = [callSubtarget<"hasTcgen05InstSupport">] in { def tcgen05_fence_before_thread_sync: NullaryInst< "tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>; @@ -5231,8 +5235,7 @@ class TCGEN05_LDST_REGINFO<int Veclen> { // class TCGEN05_LD_INST<string Shape, int Num, bit Pack> : - NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + NVPTXInst<(outs), (ins), "?", []> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>; @@ -5256,8 +5259,7 @@ class TCGEN05_LD_INST<string Shape, int Num, bit Pack> : // class TCGEN05_ST_INST<string Shape, int Num, bit Unpack> : - NVPTXInst<(outs), (ins), "?", []>, - Requires<[hasTcgen05Instructions]> { + NVPTXInst<(outs), (ins), "?", []> { TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO< NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index c548967..989be50 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -72,6 +72,40 @@ const SelectionDAGTargetInfo *NVPTXSubtarget::getSelectionDAGInfo() const { return TSInfo.get(); } +bool NVPTXSubtarget::hasPTXWithFamilySMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const { + unsigned PTXVer = getPTXVersion(); + if (!hasFamilySpecificFeatures() || PTXVer < PTXVersion) + return false; + + unsigned SMVer = getSmVersion(); + return llvm::any_of(SMVersions, [&](unsigned SM) { + // sm_101 is a different family, never group it with sm_10x. + if (SMVer == 101 || SM == 101) + return SMVer == SM && + // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not + // supported. + !(PTXVer >= 90 && SMVer == 101); + + return getSmFamilyVersion() == SM / 10 && SMVer >= SM; + }); +} + +bool NVPTXSubtarget::hasPTXWithAccelSMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const { + unsigned PTXVer = getPTXVersion(); + if (!hasArchAccelFeatures() || PTXVer < PTXVersion) + return false; + + unsigned SMVer = getSmVersion(); + return llvm::any_of(SMVersions, [&](unsigned SM) { + return SMVer == SM && + // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not + // supported. + !(PTXVer >= 90 && SMVer == 101); + }); +} + bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index e81c56b..194dbdc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -73,6 +73,18 @@ public: const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; + // Checks PTX version and family-specific and architecture-specific SM + // versions. For example, sm_100{f/a} and any future variants in the same + // family will match for any PTX version greater than or equal to + // `PTXVersion`. + bool hasPTXWithFamilySMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const; + // Checks PTX version and architecture-specific SM versions. + // For example, sm_100{a} will match for any PTX version greater than or equal + // to `PTXVersion`. + bool hasPTXWithAccelSMs(unsigned PTXVersion, + ArrayRef<unsigned> SMVersions) const; + bool has256BitVectorLoadStore(unsigned AS) const { return SmVersion >= 100 && PTXVersion >= 88 && AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; @@ -127,6 +139,27 @@ public: return HasTcgen05 && PTXVersion >= MinPTXVersion; } + // Checks following instructions support: + // - tcgen05.ld/st + // - tcgen05.alloc/dealloc/relinquish + // - tcgen05.cp + // - tcgen05.fence/wait + // - tcgen05.commit + bool hasTcgen05InstSupport() const { + // sm_101 renamed to sm_110 in PTX 9.0 + return hasPTXWithFamilySMs(90, {100, 110}) || + hasPTXWithFamilySMs(88, {100, 101}) || + hasPTXWithAccelSMs(86, {100, 101}); + } + + // Checks tcgen05.shift instruction support. + bool hasTcgen05ShiftSupport() const { + // sm_101 renamed to sm_110 in PTX 9.0 + return hasPTXWithAccelSMs(90, {100, 110, 103}) || + hasPTXWithAccelSMs(88, {100, 101, 103}) || + hasPTXWithAccelSMs(86, {100, 101}); + } + bool hasTcgen05MMAScaleInputDImm() const { return FullSmVersion == 1003 && PTXVersion >= 86; } @@ -158,6 +191,7 @@ public: bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } + unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; } // GPUs with "a" suffix have architecture-accelerated features that are // supported on the specified architecture only, hence such targets do not // follow the onion layer model. hasArchAccelFeatures() allows distinguishing diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4b54231..8851a0f 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1659,6 +1659,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -1, (1 << 5) - 1, "immediate must be non-zero in the range"); + case Match_InvalidXSfmmVType: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return generateXSfmmVTypeError(ErrorLoc); + } case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return generateVTypeError(ErrorLoc); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 70b7c43..e75dfe3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -142,6 +142,22 @@ enum { ReadsPastVLShift = DestEEWShift + 2, ReadsPastVLMask = 1ULL << ReadsPastVLShift, + + // 0 -> Don't care about altfmt bit in VTYPE. + // 1 -> Is not altfmt. + // 2 -> Is altfmt(BF16). + AltFmtTypeShift = ReadsPastVLShift + 1, + AltFmtTypeMask = 3ULL << AltFmtTypeShift, + + // XSfmmbase + HasTWidenOpShift = AltFmtTypeShift + 2, + HasTWidenOpMask = 1ULL << HasTWidenOpShift, + + HasTMOpShift = HasTWidenOpShift + 1, + HasTMOpMask = 1ULL << HasTMOpShift, + + HasTKOpShift = HasTMOpShift + 1, + HasTKOpMask = 1ULL << HasTKOpShift, }; // Helper functions to read TSFlags. @@ -183,6 +199,11 @@ static inline bool hasRoundModeOp(uint64_t TSFlags) { return TSFlags & HasRoundModeOpMask; } +enum class AltFmtType { DontCare, NotAltFmt, AltFmt }; +static inline AltFmtType getAltFmtType(uint64_t TSFlags) { + return static_cast<AltFmtType>((TSFlags & AltFmtTypeMask) >> AltFmtTypeShift); +} + /// \returns true if this instruction uses vxrm static inline bool usesVXRM(uint64_t TSFlags) { return TSFlags & UsesVXRMMask; } @@ -204,11 +225,47 @@ static inline bool readsPastVL(uint64_t TSFlags) { return TSFlags & ReadsPastVLMask; } +// XSfmmbase +static inline bool hasTWidenOp(uint64_t TSFlags) { + return TSFlags & HasTWidenOpMask; +} + +static inline bool hasTMOp(uint64_t TSFlags) { return TSFlags & HasTMOpMask; } + +static inline bool hasTKOp(uint64_t TSFlags) { return TSFlags & HasTKOpMask; } + +static inline unsigned getTNOpNum(const MCInstrDesc &Desc) { + const uint64_t TSFlags = Desc.TSFlags; + assert(hasTWidenOp(TSFlags) && hasVLOp(TSFlags)); + unsigned Offset = 3; + if (hasTKOp(TSFlags)) + Offset = 4; + return Desc.getNumOperands() - Offset; +} + +static inline unsigned getTMOpNum(const MCInstrDesc &Desc) { + const uint64_t TSFlags = Desc.TSFlags; + assert(hasTWidenOp(TSFlags) && hasTMOp(TSFlags)); + if (hasTKOp(TSFlags)) + return Desc.getNumOperands() - 5; + // vtzero.t + return Desc.getNumOperands() - 4; +} + +static inline unsigned getTKOpNum(const MCInstrDesc &Desc) { + [[maybe_unused]] const uint64_t TSFlags = Desc.TSFlags; + assert(hasTWidenOp(TSFlags) && hasTKOp(TSFlags)); + return Desc.getNumOperands() - 3; +} + static inline unsigned getVLOpNum(const MCInstrDesc &Desc) { const uint64_t TSFlags = Desc.TSFlags; // This method is only called if we expect to have a VL operand, and all // instructions with VL also have SEW. assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags)); + // In Xsfmmbase, TN is an alias for VL, so here we use the same TSFlags bit. + if (hasTWidenOp(TSFlags)) + return getTNOpNum(Desc); unsigned Offset = 2; if (hasVecPolicyOp(TSFlags)) Offset = 3; @@ -226,7 +283,7 @@ static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) { const uint64_t TSFlags = Desc.TSFlags; assert(hasSEWOp(TSFlags)); unsigned Offset = 1; - if (hasVecPolicyOp(TSFlags)) + if (hasVecPolicyOp(TSFlags) || hasTWidenOp(TSFlags)) Offset = 2; return Desc.getNumOperands() - Offset; } @@ -243,6 +300,9 @@ static inline int getFRMOpNum(const MCInstrDesc &Desc) { if (!hasRoundModeOp(TSFlags) || usesVXRM(TSFlags)) return -1; + if (hasTWidenOp(TSFlags) && hasTMOp(TSFlags)) + return getTMOpNum(Desc) - 1; + // The operand order // -------------------------------------- // | n-1 (if any) | n-2 | n-3 | n-4 | @@ -385,7 +445,9 @@ enum OperandType : unsigned { OPERAND_SEW_MASK, // Vector rounding mode for VXRM or FRM. OPERAND_VEC_RM, - OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM, + // Vtype operand for XSfmm extension. + OPERAND_XSFMM_VTYPE, + OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE, // Operand is either a register or uimm5, this is used by V extension pseudo // instructions to represent a value that be passed as AVL to either vsetvli // or vsetivli. diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 437022f..974252a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3353,14 +3353,20 @@ bool RISCVDAGToDAGISel::selectSETCC(SDValue N, ISD::CondCode ExpectedCCVal, 0); return true; } - // If the RHS is [-2047,2048], we can use addi with -RHS to produce 0 if the - // LHS is equal to the RHS and non-zero otherwise. + // If the RHS is [-2047,2048], we can use addi/addiw with -RHS to produce 0 + // if the LHS is equal to the RHS and non-zero otherwise. if (isInt<12>(CVal) || CVal == 2048) { - Val = SDValue( - CurDAG->getMachineNode( - RISCV::ADDI, DL, N->getValueType(0), LHS, - CurDAG->getSignedTargetConstant(-CVal, DL, N->getValueType(0))), - 0); + unsigned Opc = RISCV::ADDI; + if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast<VTSDNode>(LHS.getOperand(1))->getVT() == MVT::i32) { + Opc = RISCV::ADDIW; + LHS = LHS.getOperand(0); + } + + Val = SDValue(CurDAG->getMachineNode(Opc, DL, N->getValueType(0), LHS, + CurDAG->getSignedTargetConstant( + -CVal, DL, N->getValueType(0))), + 0); return true; } if (isPowerOf2_64(CVal) && Subtarget->hasStdExtZbs()) { diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index cf8d120..1b7cb9b 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -168,10 +168,13 @@ struct DemandedFields { // If this is true, we demand that VTYPE is set to some legal state, i.e. that // vill is unset. bool VILL = false; + bool TWiden = false; + bool AltFmt = false; // Return true if any part of VTYPE was used bool usedVTYPE() const { - return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL; + return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL || + TWiden || AltFmt; } // Return true if any property of VL was used @@ -187,6 +190,8 @@ struct DemandedFields { TailPolicy = true; MaskPolicy = true; VILL = true; + TWiden = true; + AltFmt = true; } // Mark all VL properties as demanded @@ -212,6 +217,8 @@ struct DemandedFields { TailPolicy |= B.TailPolicy; MaskPolicy |= B.MaskPolicy; VILL |= B.VILL; + AltFmt |= B.AltFmt; + TWiden |= B.TWiden; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -258,7 +265,9 @@ struct DemandedFields { OS << "SEWLMULRatio=" << SEWLMULRatio << ", "; OS << "TailPolicy=" << TailPolicy << ", "; OS << "MaskPolicy=" << MaskPolicy << ", "; - OS << "VILL=" << VILL; + OS << "VILL=" << VILL << ", "; + OS << "AltFmt=" << AltFmt << ", "; + OS << "TWiden=" << TWiden; OS << "}"; } #endif @@ -328,6 +337,15 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType, if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) != RISCVVType::isMaskAgnostic(NewVType)) return false; + if (Used.TWiden && (RISCVVType::hasXSfmmWiden(CurVType) != + RISCVVType::hasXSfmmWiden(NewVType) || + (RISCVVType::hasXSfmmWiden(CurVType) && + RISCVVType::getXSfmmWiden(CurVType) != + RISCVVType::getXSfmmWiden(NewVType)))) + return false; + if (Used.AltFmt && + RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType)) + return false; return true; } @@ -479,6 +497,11 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) { Res.TailPolicy = false; } + Res.AltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) != + RISCVII::AltFmtType::DontCare; + Res.TWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI); + return Res; } @@ -510,6 +533,8 @@ class VSETVLIInfo { uint8_t TailAgnostic : 1; uint8_t MaskAgnostic : 1; uint8_t SEWLMULRatioOnly : 1; + uint8_t AltFmt : 1; + uint8_t TWiden : 3; public: VSETVLIInfo() @@ -586,6 +611,8 @@ public: RISCVVType::VLMUL getVLMUL() const { return VLMul; } bool getTailAgnostic() const { return TailAgnostic; } bool getMaskAgnostic() const { return MaskAgnostic; } + bool getAltFmt() const { return AltFmt; } + unsigned getTWiden() const { return TWiden; } bool hasNonZeroAVL(const LiveIntervals *LIS) const { if (hasAVLImm()) @@ -647,21 +674,31 @@ public: SEW = RISCVVType::getSEW(VType); TailAgnostic = RISCVVType::isTailAgnostic(VType); MaskAgnostic = RISCVVType::isMaskAgnostic(VType); + AltFmt = RISCVVType::isAltFmt(VType); + TWiden = + RISCVVType::hasXSfmmWiden(VType) ? RISCVVType::getXSfmmWiden(VType) : 0; } - void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) { + void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA, bool Altfmt, + unsigned W) { assert(isValid() && !isUnknown() && "Can't set VTYPE for uninitialized or unknown"); VLMul = L; SEW = S; TailAgnostic = TA; MaskAgnostic = MA; + AltFmt = Altfmt; + TWiden = W; } + void setAltFmt(bool AF) { AltFmt = AF; } + void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; } unsigned encodeVTYPE() const { assert(isValid() && !isUnknown() && !SEWLMULRatioOnly && "Can't encode VTYPE for uninitialized or unknown"); + if (TWiden != 0) + return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt); return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); } @@ -674,9 +711,9 @@ public: "Can't compare VTYPE in unknown state"); assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly && "Can't compare when only LMUL/SEW ratio is valid."); - return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) == + return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden) == std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic, - Other.MaskAgnostic); + Other.MaskAgnostic, Other.AltFmt, Other.TWiden); } unsigned getSEWLMULRatio() const { @@ -825,7 +862,9 @@ public: << "SEW=e" << (unsigned)SEW << ", " << "TailAgnostic=" << (bool)TailAgnostic << ", " << "MaskAgnostic=" << (bool)MaskAgnostic << ", " - << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}"; + << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", " + << "TWiden=" << (unsigned)TWiden << ", " + << "AltFmt=" << (bool)AltFmt << "}"; } #endif }; @@ -853,6 +892,11 @@ struct BlockData { BlockData() = default; }; +enum TKTMMode { + VSETTK = 0, + VSETTM = 1, +}; + class RISCVInsertVSETVLI : public MachineFunctionPass { const RISCVSubtarget *ST; const TargetInstrInfo *TII; @@ -908,6 +952,7 @@ private: VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const; VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const; void forwardVSETVLIAVL(VSETVLIInfo &Info) const; + bool insertVSETMTK(MachineBasicBlock &MBB, TKTMMode Mode) const; }; } // end anonymous namespace @@ -945,6 +990,18 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { VSETVLIInfo NewInfo; if (MI.getOpcode() == RISCV::PseudoVSETIVLI) { NewInfo.setAVLImm(MI.getOperand(1).getImm()); + } else if (RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI)) { + assert(MI.getOpcode() == RISCV::PseudoSF_VSETTNT || + MI.getOpcode() == RISCV::PseudoSF_VSETTNTX0); + switch (MI.getOpcode()) { + case RISCV::PseudoSF_VSETTNTX0: + NewInfo.setAVLVLMAX(); + break; + case RISCV::PseudoSF_VSETTNT: + Register ATNReg = MI.getOperand(1).getReg(); + NewInfo.setAVLRegDef(getVNInfoFromReg(ATNReg, MI, LIS), ATNReg); + break; + } } else { assert(MI.getOpcode() == RISCV::PseudoVSETVLI || MI.getOpcode() == RISCV::PseudoVSETVLIX0); @@ -1005,11 +1062,34 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags); + bool AltFmt = RISCVII::getAltFmtType(TSFlags) == RISCVII::AltFmtType::AltFmt; + InstrInfo.setAltFmt(AltFmt); + unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm(); // A Log2SEW of 0 is an operation on mask registers only. unsigned SEW = Log2SEW ? 1 << Log2SEW : 8; assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW"); + if (RISCVII::hasTWidenOp(TSFlags)) { + const MachineOperand &TWidenOp = + MI.getOperand(MI.getNumExplicitOperands() - 1); + unsigned TWiden = TWidenOp.getImm(); + + InstrInfo.setAVLVLMAX(); + if (RISCVII::hasVLOp(TSFlags)) { + const MachineOperand &TNOp = + MI.getOperand(RISCVII::getTNOpNum(MI.getDesc())); + + if (TNOp.getReg().isVirtual()) + InstrInfo.setAVLRegDef(getVNInfoFromReg(TNOp.getReg(), MI, LIS), + TNOp.getReg()); + } + + InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden); + + return InstrInfo; + } + if (RISCVII::hasVLOp(TSFlags)) { const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); if (VLOp.isImm()) { @@ -1045,7 +1125,9 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { assert(SEW == EEW && "Initial SEW doesn't match expected EEW"); } #endif - InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic); + // TODO: Propagate the twiden from previous vtype for potential reuse. + InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, + /*TWiden*/ 0); forwardVSETVLIAVL(InstrInfo); @@ -1053,10 +1135,33 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { } void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPt, DebugLoc DL, - const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) { - + MachineBasicBlock::iterator InsertPt, + DebugLoc DL, const VSETVLIInfo &Info, + const VSETVLIInfo &PrevInfo) { ++NumInsertedVSETVL; + + if (Info.getTWiden()) { + if (Info.hasAVLVLMAX()) { + Register DestReg = MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass); + auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNTX0)) + .addReg(DestReg, RegState::Define | RegState::Dead) + .addReg(RISCV::X0, RegState::Kill) + .addImm(Info.encodeVTYPE()); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MI); + LIS->createAndComputeVirtRegInterval(DestReg); + } + } else { + auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNT)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addReg(Info.getAVLReg()) + .addImm(Info.encodeVTYPE()); + if (LIS) + LIS->InsertMachineInstrInMaps(*MI); + } + return; + } + if (PrevInfo.isValid() && !PrevInfo.isUnknown()) { // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same // VLMAX. @@ -1198,7 +1303,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, // be coalesced into another vsetvli since we won't demand any fields. VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly NewInfo.setAVLImm(1); - NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true); + NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true, + /*AltFmt*/ false, /*W*/ 0); Info = NewInfo; return; } @@ -1240,7 +1346,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, (Demanded.TailPolicy ? IncomingInfo : Info).getTailAgnostic() || IncomingInfo.getTailAgnostic(), (Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() || - IncomingInfo.getMaskAgnostic()); + IncomingInfo.getMaskAgnostic(), + (Demanded.AltFmt ? IncomingInfo : Info).getAltFmt(), + Demanded.TWiden ? IncomingInfo.getTWiden() : 0); // If we only knew the sew/lmul ratio previously, replace the VTYPE but keep // the AVL. @@ -1293,7 +1401,8 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB, if (RISCVInstrInfo::isVectorConfigInstr(MI) || RISCVII::hasSEWOp(MI.getDesc().TSFlags) || - isVectorCopy(ST->getRegisterInfo(), MI)) + isVectorCopy(ST->getRegisterInfo(), MI) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) HadVectorOp = true; transferAfter(Info, MI); @@ -1675,6 +1784,12 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const { }; for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) { + // TODO: Support XSfmm. + if (RISCVII::hasTWidenOp(MI.getDesc().TSFlags) || + RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) { + NextMI = nullptr; + continue; + } if (!RISCVInstrInfo::isVectorConfigInstr(MI)) { Used.doUnion(getDemanded(MI, ST)); @@ -1788,6 +1903,65 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) { } } +bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB, + TKTMMode Mode) const { + + bool Changed = false; + for (auto &MI : MBB) { + uint64_t TSFlags = MI.getDesc().TSFlags; + if (RISCVInstrInfo::isXSfmmVectorConfigTMTKInstr(MI) || + !RISCVII::hasSEWOp(TSFlags) || !RISCVII::hasTWidenOp(TSFlags)) + continue; + + VSETVLIInfo CurrInfo = computeInfoForInstr(MI); + + if (Mode == VSETTK && !RISCVII::hasTKOp(TSFlags)) + continue; + + if (Mode == VSETTM && !RISCVII::hasTMOp(TSFlags)) + continue; + + unsigned OpNum = 0; + unsigned Opcode = 0; + switch (Mode) { + case VSETTK: + OpNum = RISCVII::getTKOpNum(MI.getDesc()); + Opcode = RISCV::PseudoSF_VSETTK; + break; + case VSETTM: + OpNum = RISCVII::getTMOpNum(MI.getDesc()); + Opcode = RISCV::PseudoSF_VSETTM; + break; + } + + assert(OpNum && Opcode && "Invalid OpNum or Opcode"); + + MachineOperand &Op = MI.getOperand(OpNum); + + auto TmpMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opcode)) + .addReg(RISCV::X0, RegState::Define | RegState::Dead) + .addReg(Op.getReg()) + .addImm(Log2_32(CurrInfo.getSEW())) + .addImm(Log2_32(CurrInfo.getTWiden()) + 1); + + Changed = true; + Register Reg = Op.getReg(); + Op.setReg(Register()); + Op.setIsKill(false); + if (LIS) { + LIS->InsertMachineInstrInMaps(*TmpMI); + LiveInterval &LI = LIS->getInterval(Reg); + + // Erase the AVL operand from the instruction. + LIS->shrinkToUses(&LI); + // TODO: Enable this once needVSETVLIPHI is supported. + // SmallVector<LiveInterval *> SplitLIs; + // LIS->splitSeparateComponents(LI, SplitLIs); + } + } + return Changed; +} + bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { // Skip if the vector extension is not enabled. ST = &MF.getSubtarget<RISCVSubtarget>(); @@ -1865,6 +2039,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) insertReadVL(MBB); + for (MachineBasicBlock &MBB : MF) { + insertVSETMTK(MBB, VSETTM); + insertVSETMTK(MBB, VSETTK); + } + BlockInfo.clear(); return HaveVectorOp; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index 2afd77a..5b06303 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -267,6 +267,22 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr, // operands' VLs. bit ReadsPastVL = 0; let TSFlags{26} = ReadsPastVL; + + // 0 -> Don't care about altfmt bit in VTYPE. + // 1 -> Is not altfmt. + // 2 -> Is altfmt(BF16). + bits<2> AltFmtType = 0; + let TSFlags{28-27} = AltFmtType; + + // XSfmmbase + bit HasTWidenOp = 0; + let TSFlags{29} = HasTWidenOp; + + bit HasTmOp = 0; + let TSFlags{30} = HasTmOp; + + bit HasTkOp = 0; + let TSFlags{31} = HasTkOp; } class RVInst<dag outs, dag ins, string opcodestr, string argstr, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 96e1078..ddb53a2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3005,6 +3005,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, else Ok = RISCVFPRndMode::isValidRoundingMode(Imm); break; + case RISCVOp::OPERAND_XSFMM_VTYPE: + Ok = RISCVVType::isValidXSfmmVType(Imm); + break; } if (!Ok) { ErrInfo = "Invalid immediate"; @@ -3670,6 +3673,11 @@ std::string RISCVInstrInfo::createMIROperandComment( RISCVVType::printVType(Imm, OS); break; } + case RISCVOp::OPERAND_XSFMM_VTYPE: { + unsigned Imm = Op.getImm(); + RISCVVType::printXSfmmVType(Imm, OS); + break; + } case RISCVOp::OPERAND_SEW: case RISCVOp::OPERAND_SEW_MASK: { unsigned Log2SEW = Op.getImm(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 298d35a..c1b23af 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -128,6 +128,9 @@ defvar TAIL_AGNOSTIC = 1; defvar TU_MU = 0; defvar TA_MU = 1; defvar TA_MA = 3; +defvar DONT_CARE_ALTFMT = 0; +defvar IS_NOT_ALTFMT = 1; +defvar IS_ALTFMT = 2; //===----------------------------------------------------------------------===// // Utilities. @@ -159,7 +162,8 @@ class PseudoToVInst<string PseudoInst> { ["_M4", ""], ["_M8", ""], ["_SE", ""], - ["_RM", ""] + ["_RM", ""], + ["_ALT", ""] ]; string VInst = !foldl(PseudoInst, AffixSubsts, Acc, AffixSubst, !subst(AffixSubst[0], AffixSubst[1], Acc)); @@ -6396,7 +6400,7 @@ let Defs = [VXSAT] in { // 13. Vector Floating-Point Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { //===----------------------------------------------------------------------===// // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// @@ -6565,7 +6569,7 @@ defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W_RM; defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W; } // mayRaiseFPException = true -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 14. Vector Reduction Operations @@ -6593,7 +6597,7 @@ defm PseudoVWREDSUM : VPseudoVWRED_VS; } } // Predicates = [HasVInstructions] -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { //===----------------------------------------------------------------------===// // 14.3. Vector Single-Width Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// @@ -6612,7 +6616,7 @@ defm PseudoVFWREDUSUM : VPseudoVFWRED_VS_RM; defm PseudoVFWREDOSUM : VPseudoVFWREDO_VS_RM; } -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 15. Vector Mask Instructions @@ -6703,7 +6707,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { // 16.2. Floating-Point Scalar Move Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach f = FPList in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in @@ -6718,7 +6722,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>; } } -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 16.3. Vector Slide Instructions @@ -6730,10 +6734,10 @@ let Predicates = [HasVInstructions] in { defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX; } // Predicates = [HasVInstructions] -let Predicates = [HasVInstructionsAnyF] in { +let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in { defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">; defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF; -} // Predicates = [HasVInstructionsAnyF] +} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT //===----------------------------------------------------------------------===// // 16.4. Vector Register Gather Instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 557d873..6a4119a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -438,8 +438,10 @@ let Predicates = [HasVendorXSfvcp] in { } foreach f = FPList in { foreach m = f.MxList in { - defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>; - defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>; + let AltFmtType = IS_NOT_ALTFMT in { + defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>; + defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>; + } } } foreach m = MxListW in { @@ -449,7 +451,8 @@ let Predicates = [HasVendorXSfvcp] in { } foreach f = FPListW in { foreach m = f.MxList in - defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>; + let AltFmtType = IS_NOT_ALTFMT in + defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td index a5ee701..5ad22e6b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td @@ -225,7 +225,7 @@ let Predicates = [HasVendorXSfmmbase] in { def SF_VSETTM : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00001, "sf.vsettm", "$rd, $rs1">; def SF_VSETTK : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00010, - "sf.vsettk", "$rd, $rs1">; + "sf.vsettk", "$rd, $rs1">; def SF_VTDISCARD : SFInstVtDiscard<"sf.vtdiscard">; def SF_VTMV_V_T : SFInstTileMoveOp<0b010000, (outs VR:$vd), (ins GPR:$rs1), @@ -277,3 +277,144 @@ let Uses = [FRM], mayRaiseFPException = true in { } // Predicates = [HasVendorXSfmm32a8f] } // DecoderNamespace = "XSfvector" + +class VPseudoSF_VTileLoad + : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 1; + let mayStore = 0; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_VTileStore + : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 1; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_VTileMove_V_T + : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 0; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_VTileMove_T_V + : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 0; + let HasVLOp = 1; // Tn + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_MatMul<RegisterClass mtd_class> + : RISCVVPseudo<(outs), + (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn, + AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> { + let mayLoad = 0; + let mayStore = 0; + let HasTmOp = 1; + let HasVLOp = 1; // Tn + let HasTkOp = 1; + let HasSEWOp = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; +} + +class VPseudoSF_MatMul_FRM<RegisterClass mtd_class> + : RISCVVPseudo<(outs), + (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm, + AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew, + ixlenimm:$twiden), []> { + let mayLoad = 0; + let mayStore = 0; + let HasTmOp = 1; + let HasVLOp = 1; // Tn + let HasTkOp = 1; + let HasSEWOp = 1; + let HasRoundModeOp = 1; + let hasPostISelHook = 1; + let HasTWidenOp = 1; + let hasSideEffects = 1; + let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); +} + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let Defs = [VL, VTYPE] in { + def PseudoSF_VSETTNT + : Pseudo<(outs GPR:$rd), + (ins GPRNoX0:$rs1, XSfmmVTypeOp:$vtypei), []>, + PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoSF_VSETTNTX0 + : Pseudo<(outs GPRNoX0:$rd), + (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>, + PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoSF_VSETTNTX0X0 + : Pseudo<(outs GPRX0:$rd), + (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>, + PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; +} + +let Defs = [VTYPE], Uses = [VTYPE], HasTWidenOp = 1, HasSEWOp = 1 in { + def PseudoSF_VSETTM + : Pseudo<(outs GPR:$rd), + (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>, + PseudoInstExpansion<(SF_VSETTM GPR:$rd, GPR:$rs1)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; + def PseudoSF_VSETTK + : Pseudo<(outs GPR:$rd), + (ins GPR:$rs1, ixlenimm:$logwsew, ixlenimm:$twiden), []>, + PseudoInstExpansion<(SF_VSETTK GPR:$rd, GPR:$rs1)>, + Sched<[WriteVSETVLI, ReadVSETVLI]>; +} +} + +foreach eew = [8, 16, 32, 64] in { + def PseudoSF_VLTE # eew : VPseudoSF_VTileLoad; + def PseudoSF_VSTE # eew : VPseudoSF_VTileStore; +} + +def PseudoSF_VTMV_T_V : VPseudoSF_VTileMove_T_V; +def PseudoSF_VTMV_V_T : VPseudoSF_VTileMove_V_T; + +foreach a = I8Encodes in + foreach b = I8Encodes in + def PseudoSF_MM_ # !toupper(a.Name) # _ # !toupper(b.Name) + : VPseudoSF_MatMul<TRM4>; + +let AltFmtType = IS_NOT_ALTFMT in + def PseudoSF_MM_F_F : VPseudoSF_MatMul_FRM<TRM2>; +let AltFmtType = IS_ALTFMT in + def PseudoSF_MM_F_F_ALT : VPseudoSF_MatMul_FRM<TRM2>; + +foreach e1 = [5, 4] in + foreach e2 = [5, 4] in + def PseudoSF_MM_E # e1 # M # !sub(7, e1) # _E # e2 # M # !sub(7, e2) + : VPseudoSF_MatMul_FRM<TRM4>; + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in { + let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in + def PseudoSF_VTZERO_T + : RISCVVPseudo<(outs), + (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew, + ixlenimm:$twiden)>; + def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td index 3658817..dcae977 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td +++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td @@ -78,7 +78,41 @@ def isVectorConfigInstr PseudoVSETVLI, PseudoVSETVLIX0, PseudoVSETVLIX0X0, - PseudoVSETIVLI + PseudoVSETIVLI, + PseudoSF_VSETTNT, + PseudoSF_VSETTNTX0, + PseudoSF_VSETTNTX0X0 + ]>>>; + +// Returns true if this is a PseudoSF_VSETTNT* instructions. +def isXSfmmVectorConfigTNInstr + : TIIPredicate<"isXSfmmVectorConfigTNInstr", + MCReturnStatement< + CheckOpcode<[ + PseudoSF_VSETTNT, + PseudoSF_VSETTNTX0, + PseudoSF_VSETTNTX0X0 + ]>>>; + +// Returns true if this is PseudoSF_VSETTM or PseudoSF_VSETTK. +def isXSfmmVectorConfigTMTKInstr + : TIIPredicate<"isXSfmmVectorConfigTMTKInstr", + MCReturnStatement< + CheckOpcode<[ + PseudoSF_VSETTM, + PseudoSF_VSETTK + ]>>>; + +// Returns true if this is a XSfmm vector configuration instruction. +def isXSfmmVectorConfigInstr + : TIIPredicate<"isXSfmmVectorConfigInstr", + MCReturnStatement< + CheckOpcode<[ + PseudoSF_VSETTNT, + PseudoSF_VSETTNTX0, + PseudoSF_VSETTNTX0X0, + PseudoSF_VSETTM, + PseudoSF_VSETTK ]>>>; // Return true if this is 'vsetvli x0, x0, vtype' which preserves diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 40b6416..e9f43b9 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -178,6 +178,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Shadow stack pointer. markSuperRegs(Reserved, RISCV::SSP); + // XSfmmbase + for (MCPhysReg Reg = RISCV::T0; Reg <= RISCV::T15; Reg++) + markSuperRegs(Reserved, Reg); + assert(checkAllSuperRegsMarked(Reserved)); return Reserved; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 6472334..47c24fc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -317,6 +317,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom); } + if (Subtarget->hasFP16()) { + setOperationAction(ISD::FMA, MVT::v8f16, Legal); + } + + if (Subtarget->hasRelaxedSIMD()) { + setOperationAction(ISD::FMULADD, MVT::v4f32, Legal); + setOperationAction(ISD::FMULADD, MVT::v2f64, Legal); + } + // Partial MLA reductions. for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) { setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal); @@ -1120,6 +1129,18 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } +bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd( + const MachineFunction &MF, EVT VT) const { + if (!Subtarget->hasFP16() || !VT.isVector()) + return false; + + EVT ScalarVT = VT.getScalarType(); + if (!ScalarVT.isSimple()) + return false; + + return ScalarVT.getSimpleVT().SimpleTy == MVT::f16; +} + bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts( SDValue Op, const TargetLoweringOpt &TLO) const { // ISel process runs DAGCombiner after legalization; this step is called diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index b33a853..472ec67 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -81,6 +81,8 @@ private: TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 49af78b..0f6e1ca 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1213,6 +1213,27 @@ defm EXTMUL_LOW_U : defm EXTMUL_HIGH_U : SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>; +// Pattern for i32x4.dot_i16x8_s +def : Pat< + (v4i32 (add + (wasm_shuffle + (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)), + (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)), + (i32 0), (i32 1), (i32 2), (i32 3), + (i32 8), (i32 9), (i32 10), (i32 11), + (i32 16), (i32 17), (i32 18), (i32 19), + (i32 24), (i32 25), (i32 26), (i32 27)), + (wasm_shuffle + (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)), + (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)), + (i32 4), (i32 5), (i32 6), (i32 7), + (i32 12), (i32 13), (i32 14), (i32 15), + (i32 20), (i32 21), (i32 22), (i32 23), + (i32 28), (i32 29), (i32 30), (i32 31))) + ), + (v4i32 (DOT v8i16:$lhs, v8i16:$rhs)) +>; + //===----------------------------------------------------------------------===// // Floating-point unary arithmetic //===----------------------------------------------------------------------===// @@ -1626,7 +1647,8 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero, // Relaxed (Negative) Multiply-Add (madd/nmadd) //===----------------------------------------------------------------------===// -multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> { +multiclass RELAXED_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, + list<Predicate> reqs> { defm MADD_#vec : SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec.vt V128:$dst), (int_wasm_relaxed_madd @@ -1640,16 +1662,46 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c", vec.prefix#".relaxed_nmadd", simdopS, reqs>; - def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), - (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fadd_contract (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b)), (vec.vt V128:$c)), + (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; + def : Pat<(fmuladd (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)), + (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; - def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), - (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fsub_contract (vec.vt V128:$c), (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; + def : Pat<(fmuladd (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>; } -defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; -defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; -defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; +defm "" : RELAXED_SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; +defm "" : RELAXED_SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>; + +//===----------------------------------------------------------------------===// +// FP16 (Negative) Multiply-Add (madd/nmadd) +//===----------------------------------------------------------------------===// + +multiclass HALF_PRECISION_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, + list<Predicate> reqs> { + defm MADD_#vec : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), + [(set (vec.vt V128:$dst), (fma + (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], + vec.prefix#".madd\t$dst, $a, $b, $c", + vec.prefix#".madd", simdopA, reqs>; + defm NMADD_#vec : + SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), + [(set (vec.vt V128:$dst), (fma + (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)))], + vec.prefix#".nmadd\t$dst, $a, $b, $c", + vec.prefix#".nmadd", simdopS, reqs>; +} +defm "" : HALF_PRECISION_SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; + +// TODO: I think separate intrinsics should be introduced for these FP16 operations. +def : Pat<(v8f16 (int_wasm_relaxed_madd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))), + (MADD_F16x8 V128:$a, V128:$b, V128:$c)>; +def : Pat<(v8f16 (int_wasm_relaxed_nmadd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))), + (NMADD_F16x8 V128:$a, V128:$b, V128:$c)>; //===----------------------------------------------------------------------===// // Laneselect |