1 files changed, 183 insertions, 107 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e1d7a07..0b54513 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -52,6 +52,11 @@ static const MachineMemOperand::Flags MOLastUse =
 static const MachineMemOperand::Flags MOCooperative =
     MachineMemOperand::MOTargetFlag3;
 
+/// Mark the MMO of accesses to memory locations that are
+/// never written to by other threads.
+static const MachineMemOperand::Flags MOThreadPrivate =
+    MachineMemOperand::MOTargetFlag4;
+
 /// Utility to store machine instructions worklist.
 struct SIInstrWorklist {
   SIInstrWorklist() = default;
@@ -125,6 +130,11 @@ public:
       unsigned SubIdx, const TargetRegisterClass *SubRC) const;
 
 private:
+  bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+                   bool NeedInversion) const;
+
+  bool invertSCCUse(MachineInstr *SCCDef) const;
+
   void swapOperands(MachineInstr &Inst) const;
 
   std::pair<bool, MachineBasicBlock *>
@@ -136,6 +146,8 @@ private:
 
   void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
 
+  void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+
   void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
 
   void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
@@ -172,7 +184,7 @@ private:
   void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
                                     SIInstrWorklist &Worklist) const;
 
-  void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
+  void addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
                                     MachineInstr &SCCDefInst,
                                     SIInstrWorklist &Worklist,
                                     Register NewCond = Register()) const;
@@ -296,6 +308,8 @@ public:
   bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
                                int64_t &ImmVal) const override;
 
+  std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
+
   unsigned getVectorRegSpillSaveOpcode(Register Reg,
                                        const TargetRegisterClass *RC,
                                        unsigned Size,
@@ -307,22 +321,20 @@ public:
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
+      unsigned SubReg = 0,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      Register DestReg, unsigned SubIdx,
-                     const MachineInstr &Orig,
-                     const TargetRegisterInfo &TRI) const override;
+                     const MachineInstr &Orig) const override;
 
   // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
   // instructions. Returns a pair of generated instructions.
@@ -426,6 +438,9 @@ public:
 
   void removeModOperands(MachineInstr &MI) const;
 
+  void mutateAndCleanupImplicit(MachineInstr &MI,
+                                const MCInstrDesc &NewDesc) const;
+
   /// Return the extracted immediate value in a subregister use from a constant
   /// materialized in a super register.
   ///
@@ -451,7 +466,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SALU;
   }
 
-  bool isSALU(uint16_t Opcode) const {
+  bool isSALU(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SALU;
   }
 
@@ -459,7 +474,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VALU;
   }
 
-  bool isVALU(uint16_t Opcode) const {
+  bool isVALU(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VALU;
   }
 
@@ -467,7 +482,7 @@ public:
     return isMIMG(MI) || isVSAMPLE(MI) || isVIMAGE(MI);
   }
 
-  bool isImage(uint16_t Opcode) const {
+  bool isImage(uint32_t Opcode) const {
     return isMIMG(Opcode) || isVSAMPLE(Opcode) || isVIMAGE(Opcode);
   }
 
@@ -475,7 +490,7 @@ public:
     return isMUBUF(MI) || isMTBUF(MI) || isImage(MI) || isFLAT(MI);
   }
 
-  bool isVMEM(uint16_t Opcode) const {
+  bool isVMEM(uint32_t Opcode) const {
     return isMUBUF(Opcode) || isMTBUF(Opcode) || isImage(Opcode);
   }
 
@@ -483,7 +498,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
   }
 
-  bool isSOP1(uint16_t Opcode) const {
+  bool isSOP1(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SOP1;
   }
 
@@ -491,7 +506,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SOP2;
   }
 
-  bool isSOP2(uint16_t Opcode) const {
+  bool isSOP2(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SOP2;
   }
 
@@ -499,7 +514,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SOPC;
   }
 
-  bool isSOPC(uint16_t Opcode) const {
+  bool isSOPC(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SOPC;
   }
 
@@ -507,7 +522,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SOPK;
   }
 
-  bool isSOPK(uint16_t Opcode) const {
+  bool isSOPK(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SOPK;
   }
 
@@ -515,7 +530,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SOPP;
   }
 
-  bool isSOPP(uint16_t Opcode) const {
+  bool isSOPP(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SOPP;
   }
 
@@ -523,7 +538,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsPacked;
   }
 
-  bool isPacked(uint16_t Opcode) const {
+  bool isPacked(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::IsPacked;
   }
 
@@ -531,7 +546,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
   }
 
-  bool isVOP1(uint16_t Opcode) const {
+  bool isVOP1(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VOP1;
   }
 
@@ -539,7 +554,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VOP2;
   }
 
-  bool isVOP2(uint16_t Opcode) const {
+  bool isVOP2(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VOP2;
   }
 
@@ -549,13 +564,13 @@ public:
 
   static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); }
 
-  bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); }
+  bool isVOP3(uint32_t Opcode) const { return isVOP3(get(Opcode)); }
 
   static bool isSDWA(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
   }
 
-  bool isSDWA(uint16_t Opcode) const {
+  bool isSDWA(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SDWA;
   }
 
@@ -563,7 +578,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
   }
 
-  bool isVOPC(uint16_t Opcode) const {
+  bool isVOPC(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VOPC;
   }
 
@@ -571,7 +586,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::MUBUF;
   }
 
-  bool isMUBUF(uint16_t Opcode) const {
+  bool isMUBUF(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
   }
 
@@ -579,15 +594,19 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::MTBUF;
   }
 
-  bool isMTBUF(uint16_t Opcode) const {
+  bool isMTBUF(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
   }
 
+  static bool isBUF(const MachineInstr &MI) {
+    return isMUBUF(MI) || isMTBUF(MI);
+  }
+
   static bool isSMRD(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
   }
 
-  bool isSMRD(uint16_t Opcode) const {
+  bool isSMRD(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SMRD;
   }
 
@@ -597,33 +616,35 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::DS;
   }
 
-  bool isDS(uint16_t Opcode) const {
+  bool isDS(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::DS;
   }
 
   static bool isLDSDMA(const MachineInstr &MI) {
-    return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+    return (isVALU(MI) && (isMUBUF(MI) || isFLAT(MI))) ||
+           (MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT);
   }
 
-  bool isLDSDMA(uint16_t Opcode) {
-    return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+  bool isLDSDMA(uint32_t Opcode) {
+    return (isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode))) ||
+           (get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT);
   }
 
   static bool isGWS(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::GWS;
   }
 
-  bool isGWS(uint16_t Opcode) const {
+  bool isGWS(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::GWS;
   }
 
-  bool isAlwaysGDS(uint16_t Opcode) const;
+  bool isAlwaysGDS(uint32_t Opcode) const;
 
   static bool isMIMG(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
   }
 
-  bool isMIMG(uint16_t Opcode) const {
+  bool isMIMG(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::MIMG;
   }
 
@@ -631,7 +652,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VIMAGE;
   }
 
-  bool isVIMAGE(uint16_t Opcode) const {
+  bool isVIMAGE(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VIMAGE;
   }
 
@@ -639,7 +660,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VSAMPLE;
   }
 
-  bool isVSAMPLE(uint16_t Opcode) const {
+  bool isVSAMPLE(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VSAMPLE;
   }
 
@@ -647,7 +668,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
   }
 
-  bool isGather4(uint16_t Opcode) const {
+  bool isGather4(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::Gather4;
   }
 
@@ -662,7 +683,7 @@ public:
     return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
   }
 
-  bool isSegmentSpecificFLAT(uint16_t Opcode) const {
+  bool isSegmentSpecificFLAT(uint32_t Opcode) const {
     auto Flags = get(Opcode).TSFlags;
     return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
   }
@@ -671,7 +692,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal;
   }
 
-  bool isFLATGlobal(uint16_t Opcode) const {
+  bool isFLATGlobal(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal;
   }
 
@@ -679,20 +700,20 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch;
   }
 
-  bool isFLATScratch(uint16_t Opcode) const {
+  bool isFLATScratch(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FlatScratch;
   }
 
   // Any FLAT encoded instruction, including global_* and scratch_*.
-  bool isFLAT(uint16_t Opcode) const {
+  bool isFLAT(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FLAT;
   }
 
-  /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
-  /// SCRATCH_ memory operands.
+  /// \returns true for SCRATCH_ instructions, or FLAT/BUF instructions unless
+  /// the MMOs do not include scratch.
   /// Conservatively correct; will return true if \p MI cannot be proven
   /// to not hit scratch.
-  bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+  bool mayAccessScratch(const MachineInstr &MI) const;
 
   /// \returns true for FLAT instructions that can access VMEM.
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
@@ -700,7 +721,7 @@ public:
   /// \returns true for FLAT instructions that can access LDS.
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
 
-  static bool isBlockLoadStore(uint16_t Opcode) {
+  static bool isBlockLoadStore(uint32_t Opcode) {
     switch (Opcode) {
     case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
     case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
@@ -714,6 +735,52 @@ public:
     }
   }
 
+  static bool setsSCCIfResultIsNonZero(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::S_ABSDIFF_I32:
+    case AMDGPU::S_ABS_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_ANDN2_B32:
+    case AMDGPU::S_ANDN2_B64:
+    case AMDGPU::S_ASHR_I32:
+    case AMDGPU::S_ASHR_I64:
+    case AMDGPU::S_BCNT0_I32_B32:
+    case AMDGPU::S_BCNT0_I32_B64:
+    case AMDGPU::S_BCNT1_I32_B32:
+    case AMDGPU::S_BCNT1_I32_B64:
+    case AMDGPU::S_BFE_I32:
+    case AMDGPU::S_BFE_I64:
+    case AMDGPU::S_BFE_U32:
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_LSHL_B32:
+    case AMDGPU::S_LSHL_B64:
+    case AMDGPU::S_LSHR_B32:
+    case AMDGPU::S_LSHR_B64:
+    case AMDGPU::S_NAND_B32:
+    case AMDGPU::S_NAND_B64:
+    case AMDGPU::S_NOR_B32:
+    case AMDGPU::S_NOR_B64:
+    case AMDGPU::S_NOT_B32:
+    case AMDGPU::S_NOT_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_ORN2_B32:
+    case AMDGPU::S_ORN2_B64:
+    case AMDGPU::S_QUADMASK_B32:
+    case AMDGPU::S_QUADMASK_B64:
+    case AMDGPU::S_WQM_B32:
+    case AMDGPU::S_WQM_B64:
+    case AMDGPU::S_XNOR_B32:
+    case AMDGPU::S_XNOR_B64:
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
+      return true;
+    default:
+      return false;
+    }
+  }
+
   static bool isEXP(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::EXP;
   }
@@ -726,7 +793,7 @@ public:
            Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1;
   }
 
-  bool isEXP(uint16_t Opcode) const {
+  bool isEXP(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::EXP;
   }
 
@@ -734,7 +801,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet;
   }
 
-  bool isAtomicNoRet(uint16_t Opcode) const {
+  bool isAtomicNoRet(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::IsAtomicNoRet;
   }
 
@@ -742,7 +809,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet;
   }
 
-  bool isAtomicRet(uint16_t Opcode) const {
+  bool isAtomicRet(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet;
   }
 
@@ -751,13 +818,17 @@ public:
                                    SIInstrFlags::IsAtomicNoRet);
   }
 
-  bool isAtomic(uint16_t Opcode) const {
+  bool isAtomic(uint32_t Opcode) const {
     return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet |
                                   SIInstrFlags::IsAtomicNoRet);
   }
 
   static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
-    return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
+    unsigned Opc = MI.getOpcode();
+    // Exclude instructions that read FROM LDS (not write to it)
+    return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD &&
+           Opc != AMDGPU::TENSOR_STORE_FROM_LDS &&
+           Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2;
   }
 
   static bool isSBarrierSCCWrite(unsigned Opcode) {
@@ -776,7 +847,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }
 
-  bool isWQM(uint16_t Opcode) const {
+  bool isWQM(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::WQM;
   }
 
@@ -784,7 +855,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
   }
 
-  bool isDisableWQM(uint16_t Opcode) const {
+  bool isDisableWQM(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
   }
 
@@ -799,7 +870,7 @@ public:
            (isSpill(MI) && isVALU(MI));
   }
 
-  bool isVGPRSpill(uint16_t Opcode) const {
+  bool isVGPRSpill(uint32_t Opcode) const {
     return Opcode != AMDGPU::SI_SPILL_S32_TO_VGPR &&
            Opcode != AMDGPU::SI_RESTORE_S32_FROM_VGPR &&
            (isSpill(Opcode) && isVALU(Opcode));
@@ -811,13 +882,13 @@ public:
            (isSpill(MI) && isSALU(MI));
   }
 
-  bool isSGPRSpill(uint16_t Opcode) const {
+  bool isSGPRSpill(uint32_t Opcode) const {
     return Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR ||
            Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
            (isSpill(Opcode) && isSALU(Opcode));
   }
 
-  bool isSpill(uint16_t Opcode) const {
+  bool isSpill(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::Spill;
   }
 
@@ -827,7 +898,7 @@ public:
 
   static bool isSpill(const MachineInstr &MI) { return isSpill(MI.getDesc()); }
 
-  static bool isWWMRegSpillOpcode(uint16_t Opcode) {
+  static bool isWWMRegSpillOpcode(uint32_t Opcode) {
     return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
            Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE ||
            Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE ||
@@ -843,7 +914,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::DPP;
   }
 
-  bool isDPP(uint16_t Opcode) const {
+  bool isDPP(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::DPP;
   }
 
@@ -851,7 +922,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::TRANS;
   }
 
-  bool isTRANS(uint16_t Opcode) const {
+  bool isTRANS(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::TRANS;
   }
 
@@ -859,7 +930,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VOP3P;
   }
 
-  bool isVOP3P(uint16_t Opcode) const {
+  bool isVOP3P(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
   }
 
@@ -867,7 +938,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
   }
 
-  bool isVINTRP(uint16_t Opcode) const {
+  bool isVINTRP(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
   }
 
@@ -877,14 +948,14 @@ public:
 
   static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); }
 
-  bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); }
+  bool isMAI(uint32_t Opcode) const { return isMAI(get(Opcode)); }
 
   static bool isMFMA(const MachineInstr &MI) {
     return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
            MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
   }
 
-  bool isMFMA(uint16_t Opcode) const {
+  bool isMFMA(uint32_t Opcode) const {
     return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
            Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64;
   }
@@ -897,7 +968,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA;
   }
 
-  bool isWMMA(uint16_t Opcode) const {
+  bool isWMMA(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::IsWMMA;
   }
 
@@ -905,7 +976,7 @@ public:
     return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
   }
 
-  bool isMFMAorWMMA(uint16_t Opcode) const {
+  bool isMFMAorWMMA(uint32_t Opcode) const {
     return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode);
   }
 
@@ -913,11 +984,11 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
   }
 
-  bool isSWMMAC(uint16_t Opcode) const {
+  bool isSWMMAC(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC;
   }
 
-  bool isDOT(uint16_t Opcode) const {
+  bool isDOT(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
   }
 
@@ -931,7 +1002,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR;
   }
 
-  bool isLDSDIR(uint16_t Opcode) const {
+  bool isLDSDIR(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::LDSDIR;
   }
 
@@ -939,7 +1010,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::VINTERP;
   }
 
-  bool isVINTERP(uint16_t Opcode) const {
+  bool isVINTERP(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::VINTERP;
   }
 
@@ -955,6 +1026,14 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
   }
 
+  static bool usesASYNC_CNT(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::ASYNC_CNT;
+  }
+
+  bool usesASYNC_CNT(uint32_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT;
+  }
+
   // Most sopk treat the immediate as a signed 16-bit, however some
   // use it as unsigned.
   static bool sopkIsZext(unsigned Opcode) {
@@ -971,7 +1050,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE;
   }
 
-  bool isScalarStore(uint16_t Opcode) const {
+  bool isScalarStore(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE;
   }
 
@@ -979,7 +1058,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;
   }
 
-  bool isFixedSize(uint16_t Opcode) const {
+  bool isFixedSize(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
   }
 
@@ -987,7 +1066,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::FPClamp;
   }
 
-  bool hasFPClamp(uint16_t Opcode) const {
+  bool hasFPClamp(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FPClamp;
   }
 
@@ -1007,7 +1086,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
   }
 
-  bool usesFPDPRounding(uint16_t Opcode) const {
+  bool usesFPDPRounding(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
   }
 
@@ -1015,7 +1094,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic;
   }
 
-  bool isFPAtomic(uint16_t Opcode) const {
+  bool isFPAtomic(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FPAtomic;
   }
 
@@ -1060,7 +1139,7 @@ public:
     return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
   }
 
-  bool doesNotReadTiedSource(uint16_t Opcode) const {
+  bool doesNotReadTiedSource(uint32_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
   }
 
@@ -1128,13 +1207,13 @@ public:
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(isCopyInstr(MI));
     Register Dest = MI.getOperand(0).getReg();
-    const MachineFunction &MF = *MI.getParent()->getParent();
+    const MachineFunction &MF = *MI.getMF();
     const MachineRegisterInfo &MRI = MF.getRegInfo();
     return !RI.isSGPRReg(MRI, Dest);
   }
 
   bool hasVGPRUses(const MachineInstr &MI) const {
-    const MachineFunction &MF = *MI.getParent()->getParent();
+    const MachineFunction &MF = *MI.getMF();
     const MachineRegisterInfo &MRI = MF.getRegInfo();
     return llvm::any_of(MI.explicit_uses(),
                         [&MRI, this](const MachineOperand &MO) {
@@ -1303,7 +1382,7 @@ public:
 
   /// Return the size in bytes of the operand OpNo on the given
   // instruction opcode.
-  unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+  unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const {
     const MCOperandInfo &OpInfo = get(Opcode).operands()[OpNo];
 
     if (OpInfo.RegClass == -1) {
@@ -1515,6 +1594,8 @@ public:
   bool isBasicBlockPrologue(const MachineInstr &MI,
                             Register Reg = Register()) const override;
 
+  bool canAddToBBProlog(const MachineInstr &MI) const;
+
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator InsPt,
                                          const DebugLoc &DL, Register Src,
@@ -1576,10 +1657,6 @@ public:
   /// Return true if this opcode should not be used by codegen.
   bool isAsmOnlyOpcode(int MCOp) const;
 
-  const TargetRegisterClass *
-  getRegClass(const MCInstrDesc &TID, unsigned OpNum,
-              const TargetRegisterInfo *TRI) const override;
-
   void fixImplicitOperands(MachineInstr &MI) const;
 
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
@@ -1593,22 +1670,21 @@ public:
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
 
+  const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
+
   InstructionUniformity
-  getInstructionUniformity(const MachineInstr &MI) const override final;
+  getInstructionUniformity(const MachineInstr &MI) const final;
 
   InstructionUniformity
   getGenericInstructionUniformity(const MachineInstr &MI) const;
 
-  const MIRFormatter *getMIRFormatter() const override {
-    if (!Formatter)
-      Formatter = std::make_unique<AMDGPUMIRFormatter>();
-    return Formatter.get();
-  }
+  const MIRFormatter *getMIRFormatter() const override;
 
   static unsigned getDSShaderTypeValue(const MachineFunction &MF);
 
   const TargetSchedModel &getSchedModel() const { return SchedModel; }
 
+  // FIXME: This should be removed
   // Enforce operand's \p OpName even alignment if required by target.
   // This is used if an operand is a 32 bit register but needs to be aligned
   // regardless.
@@ -1641,7 +1717,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
 /// skipping copy like instructions and subreg-manipulation pseudos.
 /// Following another subreg of a reg:subreg isn't supported.
 MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                               MachineRegisterInfo &MRI);
+                               const MachineRegisterInfo &MRI);
 
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
 /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
@@ -1661,86 +1737,86 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
 namespace AMDGPU {
 
   LLVM_READONLY
-  int getVOPe64(uint16_t Opcode);
+  int64_t getVOPe64(uint32_t Opcode);
 
   LLVM_READONLY
-  int getVOPe32(uint16_t Opcode);
+  int64_t getVOPe32(uint32_t Opcode);
 
   LLVM_READONLY
-  int getSDWAOp(uint16_t Opcode);
+  int64_t getSDWAOp(uint32_t Opcode);
 
   LLVM_READONLY
-  int getDPPOp32(uint16_t Opcode);
+  int64_t getDPPOp32(uint32_t Opcode);
 
   LLVM_READONLY
-  int getDPPOp64(uint16_t Opcode);
+  int64_t getDPPOp64(uint32_t Opcode);
 
   LLVM_READONLY
-  int getBasicFromSDWAOp(uint16_t Opcode);
+  int64_t getBasicFromSDWAOp(uint32_t Opcode);
 
   LLVM_READONLY
-  int getCommuteRev(uint16_t Opcode);
+  int64_t getCommuteRev(uint32_t Opcode);
 
   LLVM_READONLY
-  int getCommuteOrig(uint16_t Opcode);
+  int64_t getCommuteOrig(uint32_t Opcode);
 
   LLVM_READONLY
-  int getAddr64Inst(uint16_t Opcode);
+  int64_t getAddr64Inst(uint32_t Opcode);
 
   /// Check if \p Opcode is an Addr64 opcode.
   ///
   /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1.
   LLVM_READONLY
-  int getIfAddr64Inst(uint16_t Opcode);
+  int64_t getIfAddr64Inst(uint32_t Opcode);
 
   LLVM_READONLY
-  int getSOPKOp(uint16_t Opcode);
+  int64_t getSOPKOp(uint32_t Opcode);
 
   /// \returns SADDR form of a FLAT Global instruction given an \p Opcode
   /// of a VADDR form.
   LLVM_READONLY
-  int getGlobalSaddrOp(uint16_t Opcode);
+  int64_t getGlobalSaddrOp(uint32_t Opcode);
 
   /// \returns VADDR form of a FLAT Global instruction given an \p Opcode
   /// of a SADDR form.
   LLVM_READONLY
-  int getGlobalVaddrOp(uint16_t Opcode);
+  int64_t getGlobalVaddrOp(uint32_t Opcode);
 
   LLVM_READONLY
-  int getVCMPXNoSDstOp(uint16_t Opcode);
+  int64_t getVCMPXNoSDstOp(uint32_t Opcode);
 
   /// \returns ST form with only immediate offset of a FLAT Scratch instruction
   /// given an \p Opcode of an SS (SADDR) form.
   LLVM_READONLY
-  int getFlatScratchInstSTfromSS(uint16_t Opcode);
+  int64_t getFlatScratchInstSTfromSS(uint32_t Opcode);
 
   /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
   /// of an SVS (SADDR + VADDR) form.
   LLVM_READONLY
-  int getFlatScratchInstSVfromSVS(uint16_t Opcode);
+  int64_t getFlatScratchInstSVfromSVS(uint32_t Opcode);
 
   /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode
   /// of an SV (VADDR) form.
   LLVM_READONLY
-  int getFlatScratchInstSSfromSV(uint16_t Opcode);
+  int64_t getFlatScratchInstSSfromSV(uint32_t Opcode);
 
   /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode
   /// of an SS (SADDR) form.
   LLVM_READONLY
-  int getFlatScratchInstSVfromSS(uint16_t Opcode);
+  int64_t getFlatScratchInstSVfromSS(uint32_t Opcode);
 
   /// \returns earlyclobber version of a MAC MFMA is exists.
   LLVM_READONLY
-  int getMFMAEarlyClobberOp(uint16_t Opcode);
+  int64_t getMFMAEarlyClobberOp(uint32_t Opcode);
 
   /// \returns Version of an MFMA instruction which uses AGPRs for srcC and
   /// vdst, given an \p Opcode of an MFMA which uses VGPRs for srcC/vdst.
   LLVM_READONLY
-  int getMFMASrcCVDstAGPROp(uint16_t Opcode);
+  int64_t getMFMASrcCVDstAGPROp(uint32_t Opcode);
 
   /// \returns v_cmpx version of a v_cmp instruction.
   LLVM_READONLY
-  int getVCMPXOpFromVCMP(uint16_t Opcode);
+  int64_t getVCMPXOpFromVCMP(uint32_t Opcode);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);