diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
187 files changed, 17235 insertions, 10575 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..5df11a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -69,7 +69,7 @@ FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass(); ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass() {} + AMDGPUSimplifyLibCallsPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -298,6 +298,15 @@ private: bool GlobalOpt; }; +void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &); +extern char &AMDGPULowerExecSyncLegacyPassID; +ModulePass *createAMDGPULowerExecSyncLegacyPass(); + +struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> { + AMDGPULowerExecSyncPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &); extern char &AMDGPUSwLowerLDSLegacyPassID; ModulePass * @@ -371,13 +380,13 @@ public: class AMDGPUAnnotateUniformValuesPass : public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> { public: - AMDGPUAnnotateUniformValuesPass() {} + AMDGPUAnnotateUniformValuesPass() = default; PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { public: - SIModeRegisterPass() {} + SIModeRegisterPass() = default; PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; @@ -527,7 +536,7 @@ void initializeAMDGPUAAWrapperPassPass(PassRegistry&); ImmutablePass *createAMDGPUExternalAAWrapperPass(); void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); -void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); +void initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(PassRegistry &); ModulePass *createAMDGPUExportKernelRuntimeHandlesLegacyPass(); void 
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(PassRegistry &); @@ -562,9 +571,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ea32748..9ad2f2e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -19,69 +19,105 @@ def p4 : PtrValueType<i64, 4>; def p5 : PtrValueType<i32, 5>; def p6 : PtrValueType<i32, 6>; -//===------------------------------------------------------------===// -// Subtarget Features (device properties) -//===------------------------------------------------------------===// +//===-----------------------------------------------------------------------===// +// AMDGPU Subtarget Feature (device properties) +//===----------------------------------------------------------------------===// -def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", - "FastFMAF32", - "true", - "Assuming f32 fma is at least as fast as mul + add" +// Multiclass to define a SubtargetFeature along with optional predicates. +// Parameters: +// - FeatureString: The feature string used in the SubtargetFeature. +// - Description: The description of the feature. +// - GenPredicate: If 1 (default), generates a Has#NAME predicate. +// - GenAssemblerPredicate: If 1 (default), the predicate includes AssemblerPredicate. +// - Deps: List of dependent SubtargetFeatures (default empty). 
+// +// Usage: +// defm MadMixInsts : AMDGPUSubtargetFeature<"mad-mix-insts", "description">; +// This generates: +// - FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "HasMadMixInsts", "true", "description"> +// - HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, +// AssemblerPredicate<(any_of FeatureMadMixInsts)> +// +// With GenAssemblerPredicate=0: +// defm Foo : AMDGPUSubtargetFeature<"foo", "desc", 1, 0>; +// This generates: +// - FeatureFoo : SubtargetFeature<...> +// - HasFoo : Predicate<"Subtarget->hasFoo()"> (no AssemblerPredicate) +// +// With dependencies: +// defm Bar : AMDGPUSubtargetFeature<"bar", "desc", 1, 1, [FeatureFoo]>; +// This generates: +// - FeatureBar : SubtargetFeature<"bar", "HasBar", "true", "desc", [FeatureFoo]> +// - HasBar : Predicate + AssemblerPredicate +multiclass AMDGPUSubtargetFeature<string FeatureString, + string Description, + bit GenPredicate = 1, + bit GenAssemblerPredicate = 1, + list<SubtargetFeature> Deps = []> { + def Feature#NAME : SubtargetFeature<FeatureString, + "Has"#NAME, + "true", + Description, + Deps + >; + + if GenPredicate then + if GenAssemblerPredicate then + def Has#NAME + : Predicate<"Subtarget->has"#NAME#"()">, + AssemblerPredicate<(any_of !cast<SubtargetFeature>("Feature"#NAME))>; + else + def Has#NAME : Predicate<"Subtarget->has"#NAME#"()">; +} + +defm FastFMAF32 : AMDGPUSubtargetFeature<"fast-fmaf", + "Assuming f32 fma is at least as fast as mul + add", + /*GenPredicate=*/0 >; -def FeatureFastDenormalF32 : SubtargetFeature<"fast-denormal-f32", - "FastDenormalF32", - "true", - "Enabling denormals does not cause f32 instructions to run at f64 rates" +defm FastDenormalF32 : AMDGPUSubtargetFeature<"fast-denormal-f32", + "Enabling denormals does not cause f32 instructions to run at f64 rates", + /*GenPredicate=*/0 >; -def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128", - "MIMG_R128", - "true", - "Support 128-bit texture resources" +defm MIMG_R128 : AMDGPUSubtargetFeature<"mimg-r128", + 
"Support 128-bit texture resources", + /*GenPredicate=*/0 >; -def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", - "HalfRate64Ops", - "true", - "Most fp64 instructions are half rate instead of quarter" +defm HalfRate64Ops : AMDGPUSubtargetFeature<"half-rate-64-ops", + "Most fp64 instructions are half rate instead of quarter", + /*GenPredicate=*/0 >; -def FullRate64Ops : SubtargetFeature<"full-rate-64-ops", - "FullRate64Ops", - "true", - "Most fp64 instructions are full rate" +defm FullRate64Ops : AMDGPUSubtargetFeature<"full-rate-64-ops", + "Most fp64 instructions are full rate", + /*GenPredicate=*/0 >; -def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", - "FlatAddressSpace", - "true", +defm FlatAddressSpace : AMDGPUSubtargetFeature<"flat-address-space", "Support flat address space" >; -def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", - "FlatInstOffsets", - "true", +defm FlatInstOffsets : AMDGPUSubtargetFeature<"flat-inst-offsets", "Flat instructions have immediate offset addressing mode" >; -def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", - "FlatGlobalInsts", - "true", +defm FlatGlobalInsts : AMDGPUSubtargetFeature<"flat-global-insts", "Have global_* flat memory instructions", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", - "FlatScratchInsts", - "true", +defm FlatScratchInsts : AMDGPUSubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", - "ScalarFlatScratchInsts", - "true", +defm ScalarFlatScratchInsts : AMDGPUSubtargetFeature<"scalar-flat-scratch-insts", "Have s_scratch_* flat memory instructions" >; @@ -91,100 +127,74 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* 
flat memory instructions to access scratch" >; -def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", - "FlatGVSMode", - "true", +defm FlatGVSMode : AMDGPUSubtargetFeature<"flat-gvs-mode", "Have GVS addressing mode with flat_* instructions", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", - "AddNoCarryInsts", - "true", +defm AddNoCarryInsts : AMDGPUSubtargetFeature<"add-no-carry-insts", "Have VALU add/sub instructions without carry out" >; -def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", - "UnalignedBufferAccess", - "true", +defm UnalignedBufferAccess : AMDGPUSubtargetFeature<"unaligned-buffer-access", "Hardware supports unaligned global loads and stores" >; -def FeatureTrapHandler: SubtargetFeature<"trap-handler", - "TrapHandler", - "true", - "Trap handler support" +defm TrapHandler: AMDGPUSubtargetFeature<"trap-handler", + "Trap handler support", + /*GenPredicate=*/0 >; -def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", - "UnalignedScratchAccess", - "true", +defm UnalignedScratchAccess : AMDGPUSubtargetFeature<"unaligned-scratch-access", "Support unaligned scratch loads and stores" >; -def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access", - "UnalignedDSAccess", - "true", +defm UnalignedDSAccess : AMDGPUSubtargetFeature<"unaligned-ds-access", "Hardware supports unaligned local and region loads and stores" >; -def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode", - "RelaxedBufferOOBMode", - "true", - "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB" +defm RelaxedBufferOOBMode : AMDGPUSubtargetFeature<"relaxed-buffer-oob-mode", + "Disable strict out-of-bounds buffer guarantees. 
An OOB access may potentially" + "cause an adjacent access to be treated as if it were also OOB" >; -def FeatureApertureRegs : SubtargetFeature<"aperture-regs", - "HasApertureRegs", - "true", - "Has Memory Aperture Base and Size Registers" +defm ApertureRegs : AMDGPUSubtargetFeature<"aperture-regs", + "Has Memory Aperture Base and Size Registers", + /*GenPredicate=*/0 >; -def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", - "HasMadMixInsts", - "true", +defm MadMixInsts : AMDGPUSubtargetFeature<"mad-mix-insts", "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" >; -def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", - "HasFmaMixInsts", - "true", +defm FmaMixInsts : AMDGPUSubtargetFeature<"fma-mix-insts", "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" >; -def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts", - "HasFmaMixBF16Insts", - "true", +defm FmaMixBF16Insts : AMDGPUSubtargetFeature<"fma-mix-bf16-insts", "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions" >; -def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts", - "HasIEEEMinimumMaximumInsts", - "true", - "Has v_minimum/maximum_f16/f32/f64, v_minimummaximum/maximumminimum_f16/f32 and v_pk_minimum/maximum_f16 instructions" +defm IEEEMinimumMaximumInsts : AMDGPUSubtargetFeature<"ieee-minimum-maximum-insts", + "Has v_minimum/maximum_f16/f32/f64, v_minimummaximum/maximumminimum_f16/f32 and" + "v_pk_minimum/maximum_f16 instructions" >; -def FeatureMinimum3Maximum3F32 : SubtargetFeature<"minimum3-maximum3-f32", - "HasMinimum3Maximum3F32", - "true", +defm Minimum3Maximum3F32 : AMDGPUSubtargetFeature<"minimum3-maximum3-f32", "Has v_minimum3_f32 and v_maximum3_f32 instructions" >; -def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16", - "HasMinimum3Maximum3F16", - "true", +defm Minimum3Maximum3F16 : AMDGPUSubtargetFeature<"minimum3-maximum3-f16", "Has v_minimum3_f16 and v_maximum3_f16 
instructions" >; -def FeatureMin3Max3PKF16 : SubtargetFeature<"min3-max3-pkf16", - "HasMin3Max3PKF16", - "true", +defm Min3Max3PKF16 : AMDGPUSubtargetFeature<"min3-max3-pkf16", "Has v_pk_min3_num_f16 and v_pk_max3_num_f16 instructions" >; -def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16", - "HasMinimum3Maximum3PKF16", - "true", +defm Minimum3Maximum3PKF16 : AMDGPUSubtargetFeature<"minimum3-maximum3-pkf16", "Has v_pk_minimum3_f16 and v_pk_maximum3_f16 instructions" >; @@ -223,82 +233,67 @@ def FeaturePreciseMemory : SubtargetFeature<"precise-memory", "EnablePreciseMemory", "true", "Enable precise memory mode">; -def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", - "SGPRInitBug", - "true", +defm SGPRInitBug : AMDGPUSubtargetFeature<"sgpr-init-bug", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; -def FeatureUserSGPRInit16Bug : SubtargetFeature<"user-sgpr-init16-bug", - "UserSGPRInit16Bug", - "true", - "Bug requiring at least 16 user+system SGPRs to be enabled" +defm UserSGPRInit16Bug : AMDGPUSubtargetFeature<"user-sgpr-init16-bug", + "Bug requiring at least 16 user+system SGPRs to be enabled", + /*GenPredicate=*/0 >; -def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", - "LDSMisalignedBug", - "true", - "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode" +defm LDSMisalignedBug : AMDGPUSubtargetFeature<"lds-misaligned-bug", + "Some GFX10 bug with multi-dword LDS and flat access that is not naturally aligned in WGP mode", + /*GenPredicate=*/0 >; -def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", - "HasMFMAInlineLiteralBug", - "true", - "MFMA cannot use inline literal as SrcC" +defm MFMAInlineLiteralBug : AMDGPUSubtargetFeature<"mfma-inline-literal-bug", + "MFMA cannot use inline literal as SrcC", + /*GenPredicate=*/0 >; -def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", - "HasVcmpxPermlaneHazard", - 
"true", - "TODO: describe me" +defm VcmpxPermlaneHazard : AMDGPUSubtargetFeature<"vcmpx-permlane-hazard", + "TODO: describe me", + /*GenPredicate=*/0 >; -def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard", - "HasVMEMtoScalarWriteHazard", - "true", - "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +defm VMEMtoScalarWriteHazard : AMDGPUSubtargetFeature<"vmem-to-scalar-write-hazard", + "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution.", + /*GenPredicate=*/0 >; -def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", - "HasSMEMtoVectorWriteHazard", - "true", - "s_load_dword followed by v_cmp page faults" +defm SMEMtoVectorWriteHazard : AMDGPUSubtargetFeature<"smem-to-vector-write-hazard", + "s_load_dword followed by v_cmp page faults", + /*GenPredicate=*/0 >; -def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", - "HasInstFwdPrefetchBug", - "true", - "S_INST_PREFETCH instruction causes shader to hang" +defm InstFwdPrefetchBug : AMDGPUSubtargetFeature<"inst-fwd-prefetch-bug", + "S_INST_PREFETCH instruction causes shader to hang", + /*GenPredicate=*/0 >; -def FeatureVmemPrefInsts : SubtargetFeature<"vmem-pref-insts", - "HasVmemPrefInsts", - "true", +defm VmemPrefInsts : AMDGPUSubtargetFeature<"vmem-pref-insts", "Has flat_prefect_b8 and global_prefetch_b8 instructions" >; -def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch", - "HasSafeSmemPrefetch", - "true", - "SMEM prefetches do not fail on illegal address" +defm SafeSmemPrefetch : AMDGPUSubtargetFeature<"safe-smem-prefetch", + "SMEM prefetches do not fail on illegal address", + /*GenPredicate=*/0 >; -def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", - "HasSafeCUPrefetch", - "true", - "VMEM CU scope prefetches do not fail on illegal address" +defm SafeCUPrefetch : AMDGPUSubtargetFeature<"safe-cu-prefetch", 
+ "VMEM CU scope prefetches do not fail on illegal address", + /*GenPredicate=*/0 >; -def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", - "HasVcmpxExecWARHazard", - "true", - "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +defm VcmpxExecWARHazard : AMDGPUSubtargetFeature<"vcmpx-exec-war-hazard", + "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)", + /*GenPredicate=*/0 >; -def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", - "HasLdsBranchVmemWARHazard", - "true", - "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +defm LdsBranchVmemWARHazard : AMDGPUSubtargetFeature<"lds-branch-vmem-war-hazard", + "Switching between LDS and VMEM-tex not waiting VM_VSRC=0", + /*GenPredicate=*/0 >; class FeatureMaxHardClauseLength<int size> : SubtargetFeature< @@ -316,70 +311,60 @@ def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>; /// permitted clause length. def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>; -def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", - "HasNSAtoVMEMBug", - "true", - "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +defm NSAtoVMEMBug : AMDGPUSubtargetFeature<"nsa-to-vmem-bug", + "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero", + /*GenPredicate=*/0 >; -def FeatureNSAClauseBug : SubtargetFeature<"nsa-clause-bug", - "HasNSAClauseBug", - "true", - "MIMG-NSA in a hard clause has unpredictable results on GFX10.1" +defm NSAClauseBug : AMDGPUSubtargetFeature<"nsa-clause-bug", + "MIMG-NSA in a hard clause has unpredictable results on GFX10.1", + /*GenPredicate=*/0 >; -def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", - "HasFlatSegmentOffsetBug", - "true", - "GFX10 bug where inst_offset is ignored when flat instructions access global memory" +defm FlatSegmentOffsetBug : AMDGPUSubtargetFeature<"flat-segment-offset-bug", + "GFX10 bug where inst_offset is ignored when flat instructions access global 
memory", + /*GenPredicate=*/0 >; -def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug", - "NegativeScratchOffsetBug", - "true", +defm NegativeScratchOffsetBug : AMDGPUSubtargetFeature<"negative-scratch-offset-bug", "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9" >; -def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug", - "NegativeUnalignedScratchOffsetBug", - "true", - "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10" +defm NegativeUnalignedScratchOffsetBug : AMDGPUSubtargetFeature<"negative-unaligned-scratch-offset-bug", + "Scratch instructions with a VGPR offset and a negative immediate offset that" + "is not a multiple of 4 read wrong memory on GFX10", + /*GenPredicate=*/0 >; -def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug", - "HasOffset3fBug", - "true", - "Branch offset of 3f hardware bug" +defm Offset3fBug : AMDGPUSubtargetFeature<"offset-3f-bug", + "Branch offset of 3f hardware bug", + /*GenPredicate=*/0 >; -def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug", - "HasImageStoreD16Bug", - "true", - "Image Store D16 hardware bug" +defm ImageStoreD16Bug : AMDGPUSubtargetFeature<"image-store-d16-bug", + "Image Store D16 hardware bug", + /*GenPredicate=*/0 >; -def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug", - "HasImageGather4D16Bug", - "true", - "Image Gather4 D16 hardware bug" +defm ImageGather4D16Bug : AMDGPUSubtargetFeature<"image-gather4-d16-bug", + "Image Gather4 D16 hardware bug", + /*GenPredicate=*/0 >; -def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug", - "HasMADIntraFwdBug", - "true", - "MAD_U64/I64 intra instruction forwarding bug" +defm MADIntraFwdBug : AMDGPUSubtargetFeature<"mad-intra-fwd-bug", + "MAD_U64/I64 intra instruction forwarding bug", + /*GenPredicate=*/1, + 
/*GenAssemblerPredicate=*/0 >; -def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug", - "HasMSAALoadDstSelBug", - "true", - "MSAA loads not honoring dst_sel bug" +defm MSAALoadDstSelBug : AMDGPUSubtargetFeature<"msaa-load-dst-sel-bug", + "MSAA loads not honoring dst_sel bug", + /*GenPredicate=*/0 >; -def FeaturePrivEnabledTrap2NopBug : SubtargetFeature<"priv-enabled-trap2-nop-bug", - "HasPrivEnabledTrap2NopBug", - "true", - "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug" +defm PrivEnabledTrap2NopBug : AMDGPUSubtargetFeature<"priv-enabled-trap2-nop-bug", + "Hardware that runs with PRIV=1 interpreting 's_trap 2' as a nop bug", + /*GenPredicate=*/0 >; class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < @@ -392,28 +377,24 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; -def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", - "GCN3Encoding", - "true", - "Encoding format for VI" +defm GCN3Encoding : AMDGPUSubtargetFeature<"gcn3-encoding", + "Encoding format for VI", + /*GenPredicate=*/0 >; -def FeatureCIInsts : SubtargetFeature<"ci-insts", - "CIInsts", - "true", - "Additional instructions for CI+" +defm CIInsts : AMDGPUSubtargetFeature<"ci-insts", + "Additional instructions for CI+", + /*GenPredicate=*/0 >; -def FeatureGFX8Insts : SubtargetFeature<"gfx8-insts", - "GFX8Insts", - "true", - "Additional instructions for GFX8+" +defm GFX8Insts : AMDGPUSubtargetFeature<"gfx8-insts", + "Additional instructions for GFX8+", + /*GenPredicate=*/0 >; -def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", - "GFX9Insts", - "true", - "Additional instructions for GFX9+" +defm GFX9Insts : AMDGPUSubtargetFeature<"gfx9-insts", + "Additional instructions for GFX9+", + /*GenPredicate=*/0 >; def FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2", @@ -422,83 +403,72 @@ def 
FeatureRequiresAlignedVGPRs : SubtargetFeature<"vgpr-align2", "VGPR and AGPR tuple operands require even alignment" >; -def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", - "GFX90AInsts", - "true", - "Additional instructions for GFX90A+" - // [HasAtomicFMinFMaxF64GlobalInsts, HasAtomicFMinFMaxF64FlatInsts] // TODO +defm GFX90AInsts : AMDGPUSubtargetFeature<"gfx90a-insts", + "Additional instructions for GFX90A+", + /*GenPredicate=*/0 >; -def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", - "GFX940Insts", - "true", - "Additional instructions for GFX940+" +defm GFX940Insts : AMDGPUSubtargetFeature<"gfx940-insts", + "Additional instructions for GFX940+", + /*GenPredicate=*/0 >; -def FeaturePermlane16Swap : SubtargetFeature<"permlane16-swap", - "HasPermlane16Swap", - "true", +defm Permlane16Swap : AMDGPUSubtargetFeature<"permlane16-swap", "Has v_permlane16_swap_b32 instructions" >; -def FeaturePermlane32Swap : SubtargetFeature<"permlane32-swap", - "HasPermlane32Swap", - "true", +defm Permlane32Swap : AMDGPUSubtargetFeature<"permlane32-swap", "Has v_permlane32_swap_b32 instructions" >; -def FeatureFP8ConversionScaleInsts : SubtargetFeature<"fp8-cvt-scale-insts", - "HasFP8ConversionScaleInsts", - "true", +defm FP8ConversionScaleInsts : AMDGPUSubtargetFeature<"fp8-cvt-scale-insts", "Has fp8 conversion scale instructions" >; -def FeatureBF8ConversionScaleInsts : SubtargetFeature<"bf8-cvt-scale-insts", - "HasBF8ConversionScaleInsts", - "true", +defm BF8ConversionScaleInsts : AMDGPUSubtargetFeature<"bf8-cvt-scale-insts", "Has bf8 conversion scale instructions" >; -def FeatureFP4ConversionScaleInsts : SubtargetFeature<"fp4-cvt-scale-insts", - "HasFP4ConversionScaleInsts", - "true", +defm FP4ConversionScaleInsts : AMDGPUSubtargetFeature<"fp4-cvt-scale-insts", "Has fp4 conversion scale instructions" >; -def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts", - "HasFP6BF6ConversionScaleInsts", - "true", +defm FP6BF6ConversionScaleInsts : 
AMDGPUSubtargetFeature<"fp6bf6-cvt-scale-insts", "Has fp6 and bf6 conversion scale instructions" >; -def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts", - "HasF16BF16ToFP6BF6ConversionScaleInsts", - "true", +defm F16BF16ToFP6BF6ConversionScaleInsts : AMDGPUSubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts", "Has f16bf16 to fp6bf6 conversion scale instructions" >; -def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts", - "HasF32ToF16BF16ConversionSRInsts", - "true", +defm F32ToF16BF16ConversionSRInsts : AMDGPUSubtargetFeature<"f32-to-f16bf16-cvt-sr-insts", "Has f32 to f16bf16 conversion scale instructions" >; -def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts", - "HasAshrPkInsts", - "true", +defm AshrPkInsts : AMDGPUSubtargetFeature<"ashr-pk-insts", "Has Arithmetic Shift Pack instructions" >; -def FeatureCvtPkF16F32Inst : SubtargetFeature<"cvt-pk-f16-f32-inst", - "HasCvtPkF16F32Inst", - "true", +defm CvtPkF16F32Inst : AMDGPUSubtargetFeature<"cvt-pk-f16-f32-inst", "Has cvt_pk_f16_f32 instruction" >; -def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", - "GFX950Insts", - "true", +defm McastLoadInsts : AMDGPUSubtargetFeature<"mcast-load-insts", + "Has multicast load instructions" +>; + +defm SWakeupImm : AMDGPUSubtargetFeature<"s-wakeup-imm", + "s_wakeup takes an immediate operand" +>; + +defm SBarrierLeaveImm : AMDGPUSubtargetFeature<"s-barrier-leave-imm", + "s_barrier_leave takes an immediate operand" +>; + +defm GFX950Insts : AMDGPUSubtargetFeature<"gfx950-insts", "Additional instructions for GFX950+", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureAshrPkInsts, @@ -514,63 +484,59 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", ] >; -def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", - "GFX10Insts", - "true", - "Additional instructions for GFX10+" +defm GFX10Insts : 
AMDGPUSubtargetFeature<"gfx10-insts", + "Additional instructions for GFX10+", + /*GenPredicate=*/0 >; -def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts", - "GFX11Insts", - "true", - "Additional instructions for GFX11+" +defm GFX11Insts : AMDGPUSubtargetFeature<"gfx11-insts", + "Additional instructions for GFX11+", + /*GenPredicate=*/0 >; -def FeatureGFX12Insts : SubtargetFeature<"gfx12-insts", - "GFX12Insts", - "true", - "Additional instructions for GFX12+" +defm GFX12Insts : AMDGPUSubtargetFeature<"gfx12-insts", + "Additional instructions for GFX12+", + /*GenPredicate=*/0 >; -def FeatureGFX1250Insts : SubtargetFeature<"gfx1250-insts", - "GFX1250Insts", - "true", - "Additional instructions for GFX1250+" +defm GFX1250Insts : AMDGPUSubtargetFeature<"gfx1250-insts", + "Additional instructions for GFX1250+", + /*GenPredicate=*/0 >; -def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts", - "GFX10_3Insts", - "true", - "Additional instructions for GFX10.3" +defm GFX13Insts : AMDGPUSubtargetFeature<"gfx13-insts", + "Additional instructions for GFX13+", + /*GenPredicate=*/0, + /*GenAssemblerPredicate=*/0, + [FeatureSWakeupImm, + FeatureSBarrierLeaveImm, + ] >; -def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", - "GFX7GFX8GFX9Insts", - "true", - "Instructions shared in GFX7, GFX8, GFX9" +defm GFX10_3Insts : AMDGPUSubtargetFeature<"gfx10-3-insts", + "Additional instructions for GFX10.3", + /*GenPredicate=*/0 >; -def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", - "HasSMemRealTime", - "true", +defm GFX7GFX8GFX9Insts : AMDGPUSubtargetFeature<"gfx7-gfx8-gfx9-insts", + "Instructions shared in GFX7, GFX8, GFX9", + /*GenPredicate=*/0 +>; + +defm SMemRealTime : AMDGPUSubtargetFeature<"s-memrealtime", "Has s_memrealtime instruction" >; -def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm", - "HasInv2PiInlineImm", - "true", - "Has 1 / (2 * pi) as inline immediate" +defm Inv2PiInlineImm : 
AMDGPUSubtargetFeature<"inv-2pi-inline-imm", + "Has 1 / (2 * pi) as inline immediate", + /*GenPredicate=*/0 >; -def Feature16BitInsts : SubtargetFeature<"16-bit-insts", - "Has16BitInsts", - "true", +defm 16BitInsts : AMDGPUSubtargetFeature<"16-bit-insts", "Has i16/f16 instructions" >; -def FeatureTrue16BitInsts : SubtargetFeature<"true16", - "HasTrue16BitInsts", - "true", +defm True16BitInsts : AMDGPUSubtargetFeature<"true16", "True 16-bit operand instructions" >; @@ -580,100 +546,75 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; -def FeatureD16Writes32BitVgpr : SubtargetFeature<"d16-write-vgpr32", - "EnableD16Writes32BitVgpr", - "true", +defm D16Writes32BitVgpr : AMDGPUSubtargetFeature<"d16-write-vgpr32", "D16 instructions potentially have 32-bit data dependencies" >; -def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts", - "HasBF16TransInsts", - "true", +defm BF16TransInsts : AMDGPUSubtargetFeature<"bf16-trans-insts", "Has bf16 transcendental instructions" >; -def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts", - "HasBF16ConversionInsts", - "true", +defm BF16ConversionInsts : AMDGPUSubtargetFeature<"bf16-cvt-insts", "Has bf16 conversion instructions" >; -def FeatureBF16PackedInsts : SubtargetFeature<"bf16-pk-insts", - "HasBF16PackedInsts", - "true", +defm BF16PackedInsts : AMDGPUSubtargetFeature<"bf16-pk-insts", "Has bf16 packed instructions (fma, add, mul, max, min)" >; -def FeatureVOP3P : SubtargetFeature<"vop3p", - "HasVOP3PInsts", - "true", +defm VOP3PInsts : AMDGPUSubtargetFeature<"vop3p", "Has VOP3P packed instructions" >; -def FeatureMovrel : SubtargetFeature<"movrel", - "HasMovrel", - "true", +defm Movrel : AMDGPUSubtargetFeature<"movrel", "Has v_movrel*_b32 instructions" >; -def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode", - "HasVGPRIndexMode", - "true", +defm VGPRIndexMode : AMDGPUSubtargetFeature<"vgpr-index-mode", "Has VGPR mode register indexing" >; -def 
FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads", - "HasScalarDwordx3Loads", - "true", +defm ScalarDwordx3Loads : AMDGPUSubtargetFeature<"scalar-dwordx3-loads", "Has 96-bit scalar load instructions" >; -def FeatureScalarStores : SubtargetFeature<"scalar-stores", - "HasScalarStores", - "true", +defm ScalarStores : AMDGPUSubtargetFeature<"scalar-stores", "Has store scalar memory instructions" >; -def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics", - "HasScalarAtomics", - "true", +defm ScalarAtomics : AMDGPUSubtargetFeature<"scalar-atomics", "Has atomic scalar memory instructions" >; -def FeatureSDWA : SubtargetFeature<"sdwa", - "HasSDWA", - "true", - "Support SDWA (Sub-DWORD Addressing) extension" +defm SDWA : AMDGPUSubtargetFeature<"sdwa", + "Support SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", - "HasSDWAOmod", - "true", - "Support OMod with SDWA (Sub-DWORD Addressing) extension" +defm SDWAOmod : AMDGPUSubtargetFeature<"sdwa-omod", + "Support OMod with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", - "HasSDWAScalar", - "true", - "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +defm SDWAScalar : AMDGPUSubtargetFeature<"sdwa-scalar", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", - "HasSDWASdst", - "true", - "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +defm SDWASdst : AMDGPUSubtargetFeature<"sdwa-sdst", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", - "HasSDWAMac", - "true", - "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +defm SDWAMac : AMDGPUSubtargetFeature<"sdwa-mav", + "Support 
v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; -def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", - "HasSDWAOutModsVOPC", - "true", - "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +defm SDWAOutModsVOPC : AMDGPUSubtargetFeature<"sdwa-out-mods-vopc", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension", + /*GenPredicate=*/0 >; def FeatureDPP : SubtargetFeature<"dpp", @@ -689,270 +630,227 @@ def FeatureDPP8 : SubtargetFeature<"dpp8", "Support DPP8 (Data Parallel Primitives) extension" >; -def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit", - "HasDPALU_DPP", - "true", +defm DPALU_DPP : AMDGPUSubtargetFeature<"dpp-64bit", "Support DPP (Data Parallel Primitives) extension in DP ALU" >; -def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr", - "HasDPPSrc1SGPR", - "true", - "Support SGPR for Src1 of DPP instructions" +defm DPPSrc1SGPR : AMDGPUSubtargetFeature<"dpp-src1-sgpr", + "Support SGPR for Src1 of DPP instructions", + /*GenPredicate=*/0 >; -def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops", - "HasPackedFP32Ops", - "true", +defm PackedFP32Ops : AMDGPUSubtargetFeature<"packed-fp32-ops", "Support packed fp32 instructions" >; -def FeatureR128A16 : SubtargetFeature<"r128-a16", - "HasR128A16", - "true", - "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands, where a16 is aliased with r128" +defm R128A16 : AMDGPUSubtargetFeature<"r128-a16", + "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image " + "operands, where a16 is aliased with r128" >; -def FeatureA16 : SubtargetFeature<"a16", - "HasA16", - "true", +defm A16 : AMDGPUSubtargetFeature<"a16", "Support A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands" >; -def FeatureG16 : SubtargetFeature<"g16", - "HasG16", - "true", +defm G16 : AMDGPUSubtargetFeature<"g16", "Support G16 for 16-bit gradient image operands" >; -def FeatureNSAEncoding : 
SubtargetFeature<"nsa-encoding", - "HasNSAEncoding", - "true", - "Support NSA encoding for image instructions" +defm NSAEncoding : AMDGPUSubtargetFeature<"nsa-encoding", + "Support NSA encoding for image instructions", + /*GenPredicate=*/0 >; -def FeaturePartialNSAEncoding : SubtargetFeature<"partial-nsa-encoding", - "HasPartialNSAEncoding", - "true", - "Support partial NSA encoding for image instructions" +defm PartialNSAEncoding : AMDGPUSubtargetFeature<"partial-nsa-encoding", + "Support partial NSA encoding for image instructions", + /*GenPredicate=*/0 >; -def FeatureImageInsts : SubtargetFeature<"image-insts", - "HasImageInsts", - "true", +defm ImageInsts : AMDGPUSubtargetFeature<"image-insts", "Support image instructions" >; -def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts", - "HasExtendedImageInsts", - "true", +defm ExtendedImageInsts : AMDGPUSubtargetFeature<"extended-image-insts", "Support mips != 0, lod != 0, gather4, and get_lod" >; -def FeatureGFX10_AEncoding : SubtargetFeature<"gfx10_a-encoding", - "GFX10_AEncoding", - "true", - "Has BVH ray tracing instructions" +defm GFX10_AEncoding : AMDGPUSubtargetFeature<"gfx10_a-encoding", + "Has BVH ray tracing instructions", + /*GenPredicate=*/0 >; -def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding", - "GFX10_BEncoding", - "true", - "Encoding format GFX10_B" +defm GFX10_BEncoding : AMDGPUSubtargetFeature<"gfx10_b-encoding", + "Encoding format GFX10_B", + /*GenPredicate=*/0 >; -def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", - "HasIntClamp", - "true", +defm IntClamp : AMDGPUSubtargetFeature<"int-clamp-insts", "Support clamp for integer destination" >; -def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", - "HasUnpackedD16VMem", - "true", +defm UnpackedD16VMem : AMDGPUSubtargetFeature<"unpacked-d16-vmem", "Has unpacked d16 vmem instructions" >; -def FeatureDLInsts : SubtargetFeature<"dl-insts", - "HasDLInsts", - "true", +defm DLInsts : 
AMDGPUSubtargetFeature<"dl-insts", "Has v_fmac_f32 and v_xnor_b32 instructions" >; -def FeatureFmacF64Inst : SubtargetFeature<"fmacf64-inst", - "HasFmacF64Inst", - "true", +defm FmacF64Inst : AMDGPUSubtargetFeature<"fmacf64-inst", "Has v_fmac_f64 instruction" >; -def FeatureDot1Insts : SubtargetFeature<"dot1-insts", - "HasDot1Insts", - "true", +defm Dot1Insts : AMDGPUSubtargetFeature<"dot1-insts", "Has v_dot4_i32_i8 and v_dot8_i32_i4 instructions" >; -def FeatureDot2Insts : SubtargetFeature<"dot2-insts", - "HasDot2Insts", - "true", +defm Dot2Insts : AMDGPUSubtargetFeature<"dot2-insts", "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions" >; -def FeatureDot3Insts : SubtargetFeature<"dot3-insts", - "HasDot3Insts", - "true", +defm Dot3Insts : AMDGPUSubtargetFeature<"dot3-insts", "Has v_dot8c_i32_i4 instruction" >; -def FeatureDot4Insts : SubtargetFeature<"dot4-insts", - "HasDot4Insts", - "true", +defm Dot4Insts : AMDGPUSubtargetFeature<"dot4-insts", "Has v_dot2c_i32_i16 instruction" >; -def FeatureDot5Insts : SubtargetFeature<"dot5-insts", - "HasDot5Insts", - "true", +defm Dot5Insts : AMDGPUSubtargetFeature<"dot5-insts", "Has v_dot2c_f32_f16 instruction" >; -def FeatureDot6Insts : SubtargetFeature<"dot6-insts", - "HasDot6Insts", - "true", +defm Dot6Insts : AMDGPUSubtargetFeature<"dot6-insts", "Has v_dot4c_i32_i8 instruction" >; -def FeatureDot7Insts : SubtargetFeature<"dot7-insts", - "HasDot7Insts", - "true", +defm Dot7Insts : AMDGPUSubtargetFeature<"dot7-insts", "Has v_dot4_u32_u8, v_dot8_u32_u4 instructions" >; -def FeatureDot8Insts : SubtargetFeature<"dot8-insts", - "HasDot8Insts", - "true", +defm Dot8Insts : AMDGPUSubtargetFeature<"dot8-insts", "Has v_dot4_i32_iu8, v_dot8_i32_iu4 instructions" >; -def FeatureDot9Insts : SubtargetFeature<"dot9-insts", - "HasDot9Insts", - "true", +defm Dot9Insts : AMDGPUSubtargetFeature<"dot9-insts", "Has v_dot2_f16_f16, v_dot2_bf16_bf16 instructions" >; -def FeatureDot10Insts : SubtargetFeature<"dot10-insts", - "HasDot10Insts", - 
"true", +defm Dot10Insts : AMDGPUSubtargetFeature<"dot10-insts", "Has v_dot2_f32_f16 instruction" >; -def FeatureDot11Insts : SubtargetFeature<"dot11-insts", - "HasDot11Insts", - "true", +defm Dot11Insts : AMDGPUSubtargetFeature<"dot11-insts", "Has v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8, v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8 instructions" >; -def FeatureDot12Insts : SubtargetFeature<"dot12-insts", - "HasDot12Insts", - "true", +defm Dot12Insts : AMDGPUSubtargetFeature<"dot12-insts", "Has v_dot2_f32_bf16 instructions" >; -def FeatureDot13Insts : SubtargetFeature<"dot13-insts", - "HasDot13Insts", - "true", +defm Dot13Insts : AMDGPUSubtargetFeature<"dot13-insts", "Has v_dot2c_f32_bf16 instructions" >; - -def FeatureMAIInsts : SubtargetFeature<"mai-insts", - "HasMAIInsts", - "true", +defm MAIInsts : AMDGPUSubtargetFeature<"mai-insts", "Has mAI instructions" >; -def FeatureFP8Insts : SubtargetFeature<"fp8-insts", - "HasFP8Insts", - "true", +defm FP8Insts : AMDGPUSubtargetFeature<"fp8-insts", "Has fp8 and bf8 instructions" >; -def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts", - "HasFP8ConversionInsts", - "true", +defm FP8ConversionInsts : AMDGPUSubtargetFeature<"fp8-conversion-insts", "Has fp8 and bf8 conversion instructions" >; -def FeatureFP8E5M3Insts : SubtargetFeature<"fp8e5m3-insts", - "HasFP8E5M3Insts", - "true", +defm FP8E5M3Insts : AMDGPUSubtargetFeature<"fp8e5m3-insts", "Has fp8 e5m3 format support" >; -def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug", - "HasCvtFP8Vop1Bug", - "true", +defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug", "FP8/BF8 VOP1 form of conversion to F32 is unreliable", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0, [FeatureFP8ConversionInsts] >; -def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", - "HasPkFmacF16Inst", - "true", +defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; -def FeatureAtomicDsPkAdd16Insts : 
SubtargetFeature<"atomic-ds-pk-add-16-insts", - "HasAtomicDsPkAdd16Insts", - "true", +defm CubeInsts : AMDGPUSubtargetFeature<"cube-insts", + "Has v_cube* instructions" +>; + +defm LerpInst : AMDGPUSubtargetFeature<"lerp-inst", + "Has v_lerp_u8 instruction" +>; + +defm SadInsts : AMDGPUSubtargetFeature<"sad-insts", + "Has v_sad* instructions" +>; + +defm QsadInsts : AMDGPUSubtargetFeature<"qsad-insts", + "Has v_qsad* instructions" +>; + +defm CvtNormInsts : AMDGPUSubtargetFeature<"cvt-norm-insts", + "Has v_cvt_norm* instructions" +>; + +defm CvtPkNormVOP2Insts : AMDGPUSubtargetFeature<"cvt-pknorm-vop2-insts", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + +defm CvtPkNormVOP3Insts : AMDGPUSubtargetFeature<"cvt-pknorm-vop3-insts", + "Has v_cvt_pk_norm_*f32 instructions/Has v_cvt_pk_norm_*_f16 instructions" +>; + +defm AtomicDsPkAdd16Insts : AMDGPUSubtargetFeature<"atomic-ds-pk-add-16-insts", "Has ds_pk_add_bf16, ds_pk_add_f16, ds_pk_add_rtn_bf16, " "ds_pk_add_rtn_f16 instructions" >; -def FeatureAtomicFlatPkAdd16Insts : SubtargetFeature<"atomic-flat-pk-add-16-insts", - "HasAtomicFlatPkAdd16Insts", - "true", +defm AtomicFlatPkAdd16Insts : AMDGPUSubtargetFeature<"atomic-flat-pk-add-16-insts", "Has flat_atomic_pk_add_f16 and flat_atomic_pk_add_bf16 instructions" >; -def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", - "HasAtomicFaddRtnInsts", - "true", +defm AtomicFaddRtnInsts : AMDGPUSubtargetFeature<"atomic-fadd-rtn-insts", "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " "return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatGlobalInsts] >; -def FeatureAtomicFMinFMaxF32GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f32", - "HasAtomicFMinFMaxF32GlobalInsts", - "true", +defm AtomicFMinFMaxF32GlobalInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-global-f32", "Has global/buffer instructions for atomicrmw fmin/fmax for float" >; -def 
FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-global-f64", - "HasAtomicFMinFMaxF64GlobalInsts", - "true", +defm AtomicFMinFMaxF64GlobalInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-global-f64", "Has global/buffer instructions for atomicrmw fmin/fmax for float" >; -def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", - "HasAtomicFMinFMaxF32FlatInsts", - "true", +defm AtomicFMinFMaxF32FlatInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-flat-f32", "Has flat memory instructions for atomicrmw fmin/fmax for float", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", - "HasAtomicFMinFMaxF64FlatInsts", - "true", +defm AtomicFMinFMaxF64FlatInsts : AMDGPUSubtargetFeature<"atomic-fmin-fmax-flat-f64", "Has flat memory instructions for atomicrmw fmin/fmax for double", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", - "HasAtomicFaddNoRtnInsts", - "true", +defm AtomicFaddNoRtnInsts : AMDGPUSubtargetFeature<"atomic-fadd-no-rtn-insts", "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " "don't return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatGlobalInsts] >; @@ -965,46 +863,40 @@ def FeatureAtomicBufferGlobalPkAddF16NoRtnInsts [FeatureFlatGlobalInsts] >; -def FeatureAtomicBufferGlobalPkAddF16Insts : SubtargetFeature<"atomic-buffer-global-pk-add-f16-insts", - "HasAtomicBufferGlobalPkAddF16Insts", - "true", - "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " - "can return original value", - [FeatureFlatGlobalInsts] +defm AtomicBufferGlobalPkAddF16Insts : AMDGPUSubtargetFeature<"atomic-buffer-global-pk-add-f16-insts", + "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " + "can 
return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, + [FeatureFlatGlobalInsts] >; -def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf16-inst", - "HasAtomicGlobalPkAddBF16Inst", - "true", - "Has global_atomic_pk_add_bf16 instruction", - [FeatureFlatGlobalInsts] +defm AtomicGlobalPkAddBF16Inst : AMDGPUSubtargetFeature<"atomic-global-pk-add-bf16-inst", + "Has global_atomic_pk_add_bf16 instruction", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, + [FeatureFlatGlobalInsts] >; -def FeatureAtomicBufferPkAddBF16Inst : SubtargetFeature<"atomic-buffer-pk-add-bf16-inst", - "HasAtomicBufferPkAddBF16Inst", - "true", - "Has buffer_atomic_pk_add_bf16 instruction" +defm AtomicBufferPkAddBF16Inst : AMDGPUSubtargetFeature<"atomic-buffer-pk-add-bf16-inst", + "Has buffer_atomic_pk_add_bf16 instruction" >; -def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts", - "HasAtomicCSubNoRtnInsts", - "true", +defm AtomicCSubNoRtnInsts : AMDGPUSubtargetFeature<"atomic-csub-no-rtn-insts", "Has buffer_atomic_csub and global_atomic_csub instructions that don't " - "return original value" + "return original value", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureFlatAtomicFaddF32Inst - : SubtargetFeature<"flat-atomic-fadd-f32-inst", - "HasFlatAtomicFaddF32Inst", - "true", +defm FlatAtomicFaddF32Inst : AMDGPUSubtargetFeature<"flat-atomic-fadd-f32-inst", "Has flat_atomic_add_f32 instruction", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/1, [FeatureFlatAddressSpace] >; -def FeatureFlatBufferGlobalAtomicFaddF64Inst - : SubtargetFeature<"flat-buffer-global-fadd-f64-inst", - "HasFlatBufferGlobalAtomicFaddF64Inst", - "true", +defm FlatBufferGlobalAtomicFaddF64Inst : AMDGPUSubtargetFeature<"flat-buffer-global-fadd-f64-inst", "Has flat, buffer, and global instructions for f64 atomic fadd" >; @@ -1015,33 +907,27 @@ def FeatureMemoryAtomicFAddF32DenormalSupport "global/flat/buffer atomic fadd for float 
supports denormal handling" >; -def FeatureAgentScopeFineGrainedRemoteMemoryAtomics - : SubtargetFeature<"agent-scope-fine-grained-remote-memory-atomics", - "HasAgentScopeFineGrainedRemoteMemoryAtomics", - "true", +defm AgentScopeFineGrainedRemoteMemoryAtomics : AMDGPUSubtargetFeature< + "agent-scope-fine-grained-remote-memory-atomics", "Agent (device) scoped atomic operations, excluding those directly " "supported by PCIe (i.e. integer atomic add, exchange, and " "compare-and-swap), are functional for allocations in host or peer " - "device memory." + "device memory.", + /*GenPredicate=*/0 >; -def FeatureEmulatedSystemScopeAtomics - : SubtargetFeature<"emulated-system-scope-atomics", - "HasEmulatedSystemScopeAtomics", - "true", +defm EmulatedSystemScopeAtomics : AMDGPUSubtargetFeature< + "emulated-system-scope-atomics", "System scope atomics unsupported by the PCI-e are emulated in HW via CAS " - "loop and functional." + "loop and functional.", + /*GenPredicate=*/0 >; -def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero", - "HasDefaultComponentZero", - "true", +defm DefaultComponentZero : AMDGPUSubtargetFeature<"default-component-zero", "BUFFER/IMAGE store instructions set unspecified components to zero (before GFX12)" >; -def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast", - "HasDefaultComponentBroadcast", - "true", +defm DefaultComponentBroadcast : AMDGPUSubtargetFeature<"default-component-broadcast", "BUFFER/IMAGE store instructions set unspecified components to x component (GFX12)" >; @@ -1057,183 +943,144 @@ def FeatureSRAMECC : SubtargetFeature<"sramecc", "Enable SRAMECC" >; -def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", - "HasNoSdstCMPX", - "true", +defm NoSdstCMPX : AMDGPUSubtargetFeature<"no-sdst-cmpx", "V_CMPX does not write VCC/SGPR in addition to EXEC" >; -def FeatureVscnt : SubtargetFeature<"vscnt", - "HasVscnt", - "true", - "Has separate store vscnt counter" +defm Vscnt : 
AMDGPUSubtargetFeature<"vscnt", + "Has separate store vscnt counter", + /*GenPredicate=*/0 >; -def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst", - "HasGetWaveIdInst", - "true", +defm GetWaveIdInst : AMDGPUSubtargetFeature<"get-wave-id-inst", "Has s_get_waveid_in_workgroup instruction" >; -def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst", - "HasSMemTimeInst", - "true", +defm SMemTimeInst : AMDGPUSubtargetFeature<"s-memtime-inst", "Has s_memtime instruction" >; -def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register", - "HasShaderCyclesRegister", - "true", +defm ShaderCyclesRegister : AMDGPUSubtargetFeature<"shader-cycles-register", "Has SHADER_CYCLES hardware register" >; -def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers", - "HasShaderCyclesHiLoRegisters", - "true", - "Has SHADER_CYCLES_HI/LO hardware registers" +defm ShaderCyclesHiLoRegisters : AMDGPUSubtargetFeature<"shader-cycles-hi-lo-registers", + "Has SHADER_CYCLES_HI/LO hardware registers", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", - "HasMadMacF32Insts", - "true", +defm MadMacF32Insts : AMDGPUSubtargetFeature<"mad-mac-f32-insts", "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions" >; -def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts", - "HasDsSrc2Insts", - "true", +defm DsSrc2Insts : AMDGPUSubtargetFeature<"ds-src2-insts", "Has ds_*_src2 instructions" >; -def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", - "HasVOP3Literal", - "true", - "Can use one literal in VOP3" +defm VOP3Literal : AMDGPUSubtargetFeature<"vop3-literal", + "Can use one literal in VOP3", + /*GenPredicate=*/0 >; -def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", - "HasNoDataDepHazard", - "true", - "Does not need SW waitstates" +defm NoDataDepHazard : AMDGPUSubtargetFeature<"no-data-dep-hazard", + "Does not need SW waitstates", + 
/*GenPredicate=*/0 >; // Allocate 1536 VGPRs for wave32 and 768 VGPRs for wave64 // with allocation granularity 24 for wave32 and 12 for wave64 -def Feature1_5xVGPRs : SubtargetFeature<"allocate1_5xvgprs", - "Has1_5xVGPRs", - "true", - "Has 50% more physical VGPRs and 50% larger allocation granule" +defm 1_5xVGPRs : AMDGPUSubtargetFeature<"allocate1_5xvgprs", + "Has 50% more physical VGPRs and 50% larger allocation granule", + /*GenPredicate=*/0 >; - -def FeatureVOPD : SubtargetFeature<"vopd", - "HasVOPDInsts", - "true", - "Has VOPD dual issue wave32 instructions" +defm VOPDInsts : AMDGPUSubtargetFeature<"vopd", + "Has VOPD dual issue wave32 instructions", + /*GenPredicate=*/0 >; -def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", - "HasVALUTransUseHazard", - "true", - "Hazard when TRANS instructions are closely followed by a use of the result" +defm VALUTransUseHazard : AMDGPUSubtargetFeature<"valu-trans-use-hazard", + "Hazard when TRANS instructions are closely followed by a use of the result", + /*GenPredicate=*/0 >; -def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", - "HasSALUFloatInsts", - "true", +defm SALUFloatInsts : AMDGPUSubtargetFeature<"salu-float", "Has SALU floating point instructions" >; -def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", - "HasPseudoScalarTrans", - "true", +defm PseudoScalarTrans : AMDGPUSubtargetFeature<"pseudo-scalar-trans", "Has Pseudo Scalar Transcendental instructions" >; -def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset", - "HasRestrictedSOffset", - "true", +defm RestrictedSOffset : AMDGPUSubtargetFeature<"restricted-soffset", "Has restricted SOffset (immediate not supported)." 
>; -def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority", - "HasRequiredExportPriority", - "true", - "Export priority must be explicitly manipulated on GFX11.5" +defm RequiredExportPriority : AMDGPUSubtargetFeature<"required-export-priority", + "Export priority must be explicitly manipulated on GFX11.5", + /*GenPredicate=*/0 >; -def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order", - "HasVmemWriteVgprInOrder", - "true", - "VMEM instructions of the same type write VGPR results in order" +defm VmemWriteVgprInOrder : AMDGPUSubtargetFeature<"vmem-write-vgpr-in-order", + "VMEM instructions of the same type write VGPR results in order", + /*GenPredicate=*/0 >; -def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", - "HasBitOp3Insts", - "true", +defm BitOp3Insts : AMDGPUSubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; -def FeatureTanhInsts : SubtargetFeature<"tanh-insts", - "HasTanhInsts", - "true", +defm TanhInsts : AMDGPUSubtargetFeature<"tanh-insts", "Has v_tanh_f32/f16 instructions" >; -def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts", - "HasTensorCvtLutInsts", - "true", +defm TensorCvtLutInsts : AMDGPUSubtargetFeature<"tensor-cvt-lut-insts", "Has v_perm_pk16* instructions" >; -def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", - "HasTransposeLoadF4F6Insts", - "true", +defm TransposeLoadF4F6Insts : AMDGPUSubtargetFeature<"transpose-load-f4f6-insts", "Has ds_load_tr4/tr6 and global_load_tr4/tr6 instructions" >; -def FeaturePrngInst : SubtargetFeature<"prng-inst", - "HasPrngInst", - "true", +defm PrngInst : AMDGPUSubtargetFeature<"prng-inst", "Has v_prng_b32 instruction" >; -def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts", - "HasBVHDualAndBVH8Insts", - "true", +defm BVHDualAndBVH8Insts : AMDGPUSubtargetFeature<"bvh-dual-bvh-8-insts", "Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray 
instructions" >; -def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel", - "HasPointSampleAccel", - "true", - "Has point sample acceleration feature" +defm PointSampleAccel : AMDGPUSubtargetFeature<"point-sample-accel", + "Has point sample acceleration feature", + /*GenPredicate=*/0 >; -def Feature64BitLiterals : SubtargetFeature<"64-bit-literals", - "Has64BitLiterals", - "true", +defm 64BitLiterals : AMDGPUSubtargetFeature<"64-bit-literals", "Can use 64-bit literals with single DWORD instructions" >; -def Feature1024AddressableVGPRs : SubtargetFeature<"1024-addressable-vgprs", - "Has1024AddressableVGPRs", - "true", +defm 1024AddressableVGPRs : AMDGPUSubtargetFeature<"1024-addressable-vgprs", "Has 1024 addressable VGPRs" >; -def FeatureWaitXcnt : SubtargetFeature<"wait-xcnt", - "HasWaitXcnt", - "true", +defm SetregVGPRMSBFixup : AMDGPUSubtargetFeature<"setreg-vgpr-msb-fixup", + "S_SETREG to MODE clobbers VGPR MSB bits, requires fixup", + /*GenPredicate=*/0 +>; + +defm WaitXcnt : AMDGPUSubtargetFeature<"wait-xcnt", "Has s_wait_xcnt instruction" >; -def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", - "HasSetPrioIncWgInst", - "true", +defm SetPrioIncWgInst : AMDGPUSubtargetFeature<"setprio-inc-wg-inst", "Has s_setprio_inc_wg instruction." >; +defm SWakeupBarrier : AMDGPUSubtargetFeature<"s-wakeup-barrier-inst", + "Has s_wakeup_barrier instruction." +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -1244,11 +1091,9 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", // wave32 and wave64. Instead what users do is assemble with both // wavesizes enabled. We translate this into this special mode so this // only influences assembler behavior and nothing else. 
-def FeatureAssemblerPermissiveWavesize : SubtargetFeature< - "assembler-permissive-wavesize", - "AssemblerPermissiveWavesize", - "true", - "allow parsing wave32 and wave64 variants of instructions" +defm AssemblerPermissiveWavesize : AMDGPUSubtargetFeature<"assembler-permissive-wavesize", + "Allow parsing wave32 and wave64 variants of instructions", + /*GenPredicate=*/0 >; class FeatureMaxPrivateElementSize<int size> : SubtargetFeature< @@ -1262,12 +1107,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; -def FeatureDumpCode : SubtargetFeature <"DumpCode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter" ->; - def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", "DumpCode", "true", @@ -1321,74 +1160,64 @@ def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null", // FIXME: moveToVALU should be able to handle converting addr64 MUBUF // instructions. 
-def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", - "FlatForGlobal", +def FeatureUseFlatForGlobal : SubtargetFeature<"flat-for-global", + "UseFlatForGlobal", "true", "Force to generate flat instruction for global" >; -def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < - "auto-waitcnt-before-barrier", - "AutoWaitcntBeforeBarrier", - "true", - "Hardware automatically inserts waitcnt before barrier" +defm AutoWaitcntBeforeBarrier : AMDGPUSubtargetFeature <"auto-waitcnt-before-barrier", + "Hardware automatically inserts waitcnt before barrier", + /*GenPredicate=*/0 >; -def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier", - "BackOffBarrier", - "true", - "Hardware supports backing off s_barrier if an exception occurs" +defm BackOffBarrier : AMDGPUSubtargetFeature <"back-off-barrier", + "Hardware supports backing off s_barrier if an exception occurs", + /*GenPredicate=*/0 >; -def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range", - "HasTrigReducedRange", - "true", - "Requires use of fract on arguments to trig instructions" +defm TrigReducedRange : AMDGPUSubtargetFeature<"trig-reduced-range", + "Requires use of fract on arguments to trig instructions", + /*GenPredicate=*/0 >; -def FeatureKernargPreload : SubtargetFeature <"kernarg-preload", - "KernargPreload", - "true", - "Hardware supports preloading of kernel arguments in user SGPRs." 
+defm KernargPreload : AMDGPUSubtargetFeature <"kernarg-preload", + "Hardware supports preloading of kernel arguments in user SGPRs.", + /*GenPredicate=*/0 >; // Alignment enforcement is controlled by a configuration register: // SH_MEM_CONFIG.alignment_mode -def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", - "UnalignedAccessMode", - "true", +defm UnalignedAccessMode : AMDGPUSubtargetFeature<"unaligned-access-mode", "Enable unaligned global, local and region loads and stores if the hardware" " supports it" >; -def FeaturePackedTID : SubtargetFeature<"packed-tid", - "HasPackedTID", - "true", - "Workitem IDs are packed into v0 at kernel launch" +defm PackedTID : AMDGPUSubtargetFeature<"packed-tid", + "Workitem IDs are packed into v0 at kernel launch", + /*GenPredicate=*/0 >; -def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch", - "HasArchitectedFlatScratch", - "true", - "Flat Scratch register is a readonly SPI initialized architected register" +defm ArchitectedFlatScratch : AMDGPUSubtargetFeature<"architected-flat-scratch", + "Flat Scratch register is a readonly SPI initialized architected register", + /*GenPredicate=*/0 >; -def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs", - "HasArchitectedSGPRs", - "true", - "Enable the architected SGPRs" +defm ArchitectedSGPRs : AMDGPUSubtargetFeature<"architected-sgprs", + "Enable the architected SGPRs", + /*GenPredicate=*/0 >; -def FeatureGDS : SubtargetFeature<"gds", - "HasGDS", - "true", - "Has Global Data Share" +defm GDS : AMDGPUSubtargetFeature<"gds", + "Has Global Data Share", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; -def FeatureGWS : SubtargetFeature<"gws", - "HasGWS", - "true", - "Has Global Wave Sync" +defm GWS : AMDGPUSubtargetFeature<"gws", + "Has Global Wave Sync", + /*GenPredicate=*/1, + /*GenAssemblerPredicate=*/0 >; def FeatureRequiresCOV6 : SubtargetFeature<"requires-cov6", @@ -1397,18 +1226,14 @@ def FeatureRequiresCOV6 : 
SubtargetFeature<"requires-cov6", "Target Requires Code Object V6" >; -def FeatureXF32Insts : SubtargetFeature<"xf32-insts", - "HasXF32Insts", - "true", - "Has instructions that support xf32 format, such as " - "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" - >; +defm XF32Insts : AMDGPUSubtargetFeature<"xf32-insts", + "Has instructions that support xf32 format, such as " + "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" +>; -def FeatureGloballyAddressableScratch : SubtargetFeature< - "globally-addressable-scratch", - "HasGloballyAddressableScratch", - "true", - "FLAT instructions can access scratch memory for any thread in any wave" +defm GloballyAddressableScratch : AMDGPUSubtargetFeature<"globally-addressable-scratch", + "FLAT instructions can access scratch memory for any thread in any wave", + /*GenPredicate=*/0 >; // Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and @@ -1419,45 +1244,56 @@ def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr", "Use block load/store for VGPR callee saved registers" >; -def FeatureLshlAddU64Inst - : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", - "Has v_lshl_add_u64 instruction">; +defm LshlAddU64Inst : AMDGPUSubtargetFeature<"lshl-add-u64-inst", + "Has v_lshl_add_u64 instruction" +>; -def FeatureAddSubU64Insts - : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", - "Has v_add_u64 and v_sub_u64 instructions">; +defm AddSubU64Insts : AMDGPUSubtargetFeature<"add-sub-u64-insts", + "Has v_add_u64 and v_sub_u64 instructions" +>; -def FeatureMadU32Inst : SubtargetFeature<"mad-u32-inst", "HasMadU32Inst", - "true", "Has v_mad_u32 instruction">; +defm MadU32Inst : AMDGPUSubtargetFeature<"mad-u32-inst", + "Has v_mad_u32 instruction" +>; -def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", - "HasVMemToLDSLoad", - "true", - "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or 
global_load_lds. This does not include scratch_load_lds." +defm AddMinMaxInsts : AMDGPUSubtargetFeature<"add-min-max-insts", + "Has v_add_{min|max}_{i|u}32 instructions" >; -def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic", - "HasLdsBarrierArriveAtomic", - "true", +defm PkAddMinMaxInsts : AMDGPUSubtargetFeature<"pk-add-min-max-insts", + "Has v_pk_add_{min|max}_{i|u}16 instructions" +>; + +defm VMemToLDSLoad : AMDGPUSubtargetFeature<"vmem-to-lds-load-insts", + "The platform has memory to lds instructions (global_load w/lds bit set, buffer_load" + "w/lds bit set or global_load_lds. This does not include scratch_load_lds." +>; + +defm LdsBarrierArriveAtomic : AMDGPUSubtargetFeature<"lds-barrier-arrive-atomic", "Has LDS barrier-arrive atomic instructions" >; -def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource", - "Has45BitNumRecordsBufferResource", - "true", - "The buffer resource (V#) supports 45-bit num_records" +defm 45BitNumRecordsBufferResource : AMDGPUSubtargetFeature<"45-bit-num-records-buffer-resource", + "The buffer resource (V#) supports 45-bit num_records", + /*GenPredicate=*/0 +>; + +defm Clusters : AMDGPUSubtargetFeature<"clusters", + "Has clusters of workgroups support", + /*GenPredicate=*/0 >; -def FeatureClusters : SubtargetFeature< "clusters", - "HasClusters", +def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature< + "waits-before-system-scope-stores", + "RequiresWaitsBeforeSystemScopeStores", "true", - "Has clusters of workgroups support" + "Target requires waits for loads and atomics before system scope stores" >; -// Dummy feature used to disable assembler instructions. -def FeatureDisable : SubtargetFeature<"", - "FeatureDisable","true", - "Dummy feature to disable assembler instructions" +def FeatureUseAddPC64Inst : SubtargetFeature<"use-add-pc64-inst", + "UseAddPC64Inst", + "true", + "Use s_add_pc_i64 instruction." 
>; //===----------------------------------------------------------------------===// @@ -1475,7 +1311,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1489,7 +1326,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtPkNormVOP2Insts ] >; @@ -1505,7 +1343,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, - FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtPkNormVOP2Insts ] >; @@ -1515,7 +1355,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3PInsts, FeatureVGPRIndexMode, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, 
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, @@ -1524,7 +1364,10 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, + FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1534,7 +1377,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureFlatAddressSpace, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, + FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3PInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, @@ -1548,7 +1391,10 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, - FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad + FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, FeatureCubeInsts, + FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, + FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ] >; @@ -1559,7 +1405,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, 
FeatureGFX9Insts, FeatureGFX10Insts, FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, - FeatureGFX11Insts, FeatureVOP3P, FeatureVOPD, FeatureTrue16BitInsts, + FeatureGFX11Insts, FeatureVOP3PInsts, FeatureVOPDInsts, FeatureTrue16BitInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, @@ -1571,7 +1417,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, - FeatureVmemWriteVgprInOrder + FeatureVmemWriteVgprInOrder, FeatureCubeInsts, FeatureLerpInst, + FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts ] >; @@ -1582,7 +1430,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, - FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3P, FeatureVOPD, + FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3PInsts, FeatureVOPDInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, @@ -1599,6 +1447,29 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", ] >; +def FeatureGFX13 : GCNSubtargetFeatureGeneration<"GFX13", + "gfx13", + [FeatureFP64, FeatureMIMG_R128, + FeatureFlatAddressSpace, Feature16BitInsts, + FeatureInv2PiInlineImm, FeatureApertureRegs, + FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, + FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, + FeatureGFX11Insts, FeatureGFX12Insts, FeatureGFX13Insts, 
FeatureVOP3PInsts, + FeatureVOPDInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, + FeatureNoSdstCMPX, FeatureVscnt, + FeatureVOP3Literal, FeatureDPP8, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, + FeatureA16, FeatureFastDenormalF32, FeatureG16, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, FeatureImageInsts, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, + FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, + FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32, + FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics + ] +>; //===----------------------------------------------------------------------===// class FeatureSet<list<SubtargetFeature> Features_> { @@ -1607,7 +1478,7 @@ class FeatureSet<list<SubtargetFeature> Features_> { def FeatureISAVersion6_0_0 : FeatureSet<[FeatureSouthernIslands, FeatureFastFMAF32, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : FeatureSet< @@ -1624,7 +1495,7 @@ def FeatureISAVersion7_0_0 : FeatureSet< def FeatureISAVersion7_0_1 : FeatureSet< [FeatureSeaIslands, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureLDSBankCount32, FeatureFastFMAF32]>; @@ -1653,7 +1524,7 @@ def FeatureISAVersion8_0_Common : FeatureSet< def FeatureISAVersion8_0_1 : FeatureSet< !listconcat(FeatureISAVersion8_0_Common.Features, [FeatureFastFMAF32, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureSupportsXNACK])>; def FeatureISAVersion8_0_2 : FeatureSet< @@ -1724,7 +1595,7 @@ def FeatureISAVersion9_0_4 : FeatureSet< def FeatureISAVersion9_0_6 : FeatureSet< !listconcat(FeatureISAVersion9_0_Consumer_Common.Features, - [HalfRate64Ops, + [FeatureHalfRate64Ops, FeatureFmaMixInsts, FeatureDLInsts, FeatureDot1Insts, @@ -1736,7 +1607,7 @@ def 
FeatureISAVersion9_0_6 : FeatureSet< def FeatureISAVersion9_0_8 : FeatureSet< !listconcat(FeatureISAVersion9_0_MI_Common.Features, [FeatureGDS, - HalfRate64Ops, + FeatureHalfRate64Ops, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, @@ -1757,7 +1628,7 @@ def FeatureISAVersion9_0_A : FeatureSet< FeatureAtomicFaddRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts, FeaturePackedTID, - FullRate64Ops, + FeatureFullRate64Ops, FeatureBackOffBarrier, FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, @@ -1800,7 +1671,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureSupportsSRAMECC, FeaturePackedTID, FeatureArchitectedFlatScratch, - FullRate64Ops, + FeatureFullRate64Ops, FeatureBackOffBarrier, FeatureKernargPreload, FeatureAtomicFMinFMaxF64GlobalInsts, @@ -1861,7 +1732,7 @@ def FeatureISAVersion10_1_Common : FeatureSet< FeatureGetWaveIdInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureLdsMisalignedBug, + FeatureLDSMisalignedBug, FeatureSupportsXNACK, // gfx101x bugs FeatureVcmpxPermlaneHazard, @@ -2009,6 +1880,13 @@ def FeatureISAVersion11_5_3 : FeatureSet< !listconcat(FeatureISAVersion11_5_Common.Features, [])>; +def FeatureISAVersion11_7_0 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, + FeatureFP8ConversionInsts, + FeatureDot11Insts])>; + def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, FeatureBackOffBarrier, @@ -2042,20 +1920,28 @@ def FeatureISAVersion12 : FeatureSet< FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, FeaturePseudoScalarTrans, - FeatureHasRestrictedSOffset, + FeatureRestrictedSOffset, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, Feature1_5xVGPRs, FeatureMemoryAtomicFAddF32DenormalSupport, - FeatureBVHDualAndBVH8Insts + FeatureBVHDualAndBVH8Insts, + FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + 
FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts ]>; -def FeatureISAVersion12_50 : FeatureSet< +def FeatureISAVersion12_50_Common : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, - FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature1024AddressableVGPRs, Feature64BitLiterals, @@ -2084,7 +1970,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, FeaturePseudoScalarTrans, - FeatureHasRestrictedSOffset, + FeatureRestrictedSOffset, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, @@ -2115,22 +2001,107 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureLshlAddU64Inst, FeatureAddSubU64Insts, FeatureMadU32Inst, + FeatureAddMinMaxInsts, + FeaturePkAddMinMaxInsts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, + FeatureSWakeupBarrier, Feature45BitNumRecordsBufferResource, FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, + FeatureMcastLoadInsts ]>; +def FeatureISAVersion12_50 : FeatureSet< + !listconcat(FeatureISAVersion12_50_Common.Features, + [FeatureAddressableLocalMemorySize327680, + FeatureSetregVGPRMSBFixup, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts])>; + def FeatureISAVersion12_51 : FeatureSet< - !listconcat(FeatureISAVersion12_50.Features, - [FeatureDPALU_DPP])>; + !listconcat(FeatureISAVersion12_50_Common.Features, + [FeatureAddressableLocalMemorySize327680, + FeatureDPALU_DPP, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts])>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, [FeatureRequiresCOV6])>; +def FeatureISAVersion13 : FeatureSet< + [FeatureGFX13, + FeatureGFX1250Insts, + 
FeatureAddressableLocalMemorySize65536, + Feature64BitLiterals, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureFmacF64Inst, + FeatureDot7Insts, + FeatureDot8Insts, + FeatureNSAEncoding, + FeaturePartialNSAEncoding, + FeatureShaderCyclesRegister, + FeatureArchitectedFlatScratch, + FeatureArchitectedSGPRs, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, + FeatureAtomicBufferPkAddBF16Inst, + FeatureFlatAtomicFaddF32Inst, + FeatureFP8ConversionInsts, + FeaturePackedTID, + FeatureVcmpxPermlaneHazard, + FeatureSALUFloatInsts, + FeaturePseudoScalarTrans, + FeatureRestrictedSOffset, + FeatureScalarDwordx3Loads, + FeatureDPPSrc1SGPR, + FeatureBitOp3Insts, + FeatureTanhInsts, + FeatureTensorCvtLutInsts, + FeatureTransposeLoadF4F6Insts, + Feature1_5xVGPRs, + FeatureBF16TransInsts, + FeatureBF16ConversionInsts, + FeatureBF16PackedInsts, + FeaturePrngInst, + FeaturePermlane16Swap, + FeatureAshrPkInsts, + FeatureAtomicFMinFMaxF64GlobalInsts, + FeatureAtomicFMinFMaxF64FlatInsts, + FeatureFmaMixBF16Insts, + FeatureGloballyAddressableScratch, + FeatureCvtPkF16F32Inst, + FeatureF16BF16ToFP6BF6ConversionScaleInsts, + FeatureIEEEMinimumMaximumInsts, + FeatureSWakeupBarrier, + FeatureClusters, + FeatureCubeInsts, + FeatureLerpInst, + FeatureSadInsts, + FeatureQsadInsts, + FeatureCvtNormInsts, + FeatureCvtPkNormVOP2Insts, + FeatureCvtPkNormVOP3Insts, +]>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -2303,6 +2274,11 @@ def isGFX8GFX9GFX10GFX11 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX12Insts))>; +def isGFX8GFX9GFX10GFX11GFX12 : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&" + "Subtarget->getGeneration() < AMDGPUSubtarget::GFX13">, + 
AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX13Insts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -2360,18 +2336,6 @@ def isNotGFX940Plus : Predicate<"!Subtarget->hasGFX940Insts()">, AssemblerPredicate<(all_of (not FeatureGFX940Insts))>; -def HasGFX950Insts : - Predicate<"Subtarget->hasGFX950Insts()">, - AssemblerPredicate<(all_of FeatureGFX950Insts)>; - -def HasPermlane16Swap : - Predicate<"Subtarget->hasPermlane16Swap()">, - AssemblerPredicate<(all_of FeaturePermlane16Swap)>; - -def HasPermlane32Swap : - Predicate<"Subtarget->hasPermlane32Swap()">, - AssemblerPredicate<(all_of FeaturePermlane32Swap)>; - def isGFX8GFX9NotGFX940 : Predicate<"!Subtarget->hasGFX940Insts() &&" "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" @@ -2425,9 +2389,14 @@ def isGFX11Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of FeatureGFX11Insts)>; +def isGFX11PlusNot12_50 : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&" + "(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">, + AssemblerPredicate<(all_of FeatureGFX11Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>; + def isGFX12Only : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">, - AssemblerPredicate<(all_of FeatureGFX12Insts)>; + AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX13Insts))>; def isGFX12Not12_50 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">, @@ -2438,12 +2407,13 @@ def isGFX12Plus : AssemblerPredicate<(all_of FeatureGFX12Insts)>; def isGFX12PlusNot12_50 : - Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">, - AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX1250Insts))>; + Predicate<"Subtarget->getGeneration() >= 
AMDGPUSubtarget::GFX12 &&" + "(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">, + AssemblerPredicate<(all_of FeatureGFX12Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>; def isGFX125xOnly : - Predicate<"Subtarget->hasGFX1250Insts()">, - AssemblerPredicate<(all_of FeatureGFX1250Insts)>; + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && Subtarget->hasGFX1250Insts()">, + AssemblerPredicate<(all_of FeatureGFX1250Insts, (not FeatureGFX13Insts))>; def isGFX1250Plus : Predicate<"Subtarget->hasGFX1250Insts()">, @@ -2454,63 +2424,27 @@ def isNotGFX1250Plus : AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>; def isGFX940orGFX1250 : - Predicate<"Subtarget->hasGFX940Insts() || Subtarget->hasGFX1250Insts()">, - AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX1250Insts)>; - -def HasIEEEMinimumMaximumInsts : - Predicate<"Subtarget->hasIEEEMinimumMaximumInsts()">, - AssemblerPredicate<(all_of FeatureIEEEMinimumMaximumInsts)>; - -def HasMinimum3Maximum3F32 : - Predicate<"Subtarget->hasMinimum3Maximum3F32()">, - AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>; - -def HasMinimum3Maximum3F16 : - Predicate<"Subtarget->hasMinimum3Maximum3F16()">, - AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>; - -def HasMin3Max3PKF16 : - Predicate<"Subtarget->hasMin3Max3PKF16()">, - AssemblerPredicate<(all_of FeatureMin3Max3PKF16)>; - -def HasMinimum3Maximum3PKF16 : - Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">, - AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>; - + Predicate<"Subtarget->hasGFX940Insts() ||" + "(Subtarget->hasGFX1250Insts() && !Subtarget->hasGFX13Insts())">, + AssemblerPredicate<(any_of FeatureGFX940Insts, + (all_of FeatureGFX1250Insts, (not FeatureGFX13Insts)))>; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, - AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; +def isGFX13Only : + Predicate<"Subtarget->getGeneration() == 
AMDGPUSubtarget::GFX13">, + AssemblerPredicate<(all_of FeatureGFX13Insts)>; -def HasFlatBufferGlobalAtomicFaddF64Inst : - Predicate<"Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst()">, - AssemblerPredicate<(any_of FeatureFlatBufferGlobalAtomicFaddF64Inst)>; +def isGFX13Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13">, + AssemblerPredicate<(all_of FeatureGFX13Insts)>; -def HasAtomicFMinFMaxF32GlobalInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF32GlobalInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32GlobalInsts)>; - -def HasAtomicFMinFMaxF64GlobalInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF64GlobalInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64GlobalInsts)>; - -def HasAtomicFMinFMaxF32FlatInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF32FlatInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF32FlatInsts)>; - -def HasAtomicFMinFMaxF64FlatInsts : - Predicate<"Subtarget->hasAtomicFMinFMaxF64FlatInsts()">, - AssemblerPredicate<(any_of FeatureAtomicFMinFMaxF64FlatInsts)>; +def HasAtomicCondSubClampFlatInsts : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; def HasLdsAtomicAddF64 : Predicate<"Subtarget->hasLdsAtomicAddF64()">, AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>; -def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, - AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; -def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, - AssemblerPredicate<(all_of FeatureFlatScratchInsts)>; -def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, - AssemblerPredicate<(all_of FeatureScalarFlatScratchInsts)>; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; @@ -2519,24 +2453,17 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def 
HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; -def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, - AssemblerPredicate<(all_of FeatureFlatGVSMode)>; - def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>; -def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, - AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>; def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>; -def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">, - AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>; def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">, - AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>; + AssemblerPredicate<(all_of (not FeatureRestrictedSOffset))>; def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, @@ -2552,7 +2479,7 @@ def HasFormattedMUBUFInsts : Predicate<"Subtarget->hasFormattedMUBUFInsts()">, AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>; def HasExportInsts : Predicate<"Subtarget->hasExportInsts()">, - AssemblerPredicate<(all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts))>; + AssemblerPredicate<(any_of FeatureGFX13Insts, (all_of (not FeatureGFX90AInsts), (not FeatureGFX1250Insts)))>; def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">, AssemblerPredicate<(all_of FeatureGFX11Insts, (not FeatureGFX1250Insts))>; @@ -2563,18 +2490,10 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9 def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">, AssemblerPredicate<(all_of FeatureGFX8Insts)>; -def HasAddNoCarryInsts : 
Predicate<"Subtarget->hasAddNoCarry()">, - AssemblerPredicate<(all_of FeatureAddNoCarryInsts)>; - -def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">; def HasXNACKEnabled : Predicate<"Subtarget->isXNACKEnabled()">; -def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, - AssemblerPredicate<(all_of Feature16BitInsts)>; - -def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">, - AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">, AssemblerPredicate<(all_of (not FeatureTrue16BitInsts))>; @@ -2588,30 +2507,14 @@ def NotUseRealTrue16Insts : True16PredicateClass<"!Subtarget->useRealTrue16Insts AssemblerPredicate<(not (all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts))>; def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && " "!Subtarget->useRealTrue16Insts()">, - AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; - // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. 
- // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; + AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; def UseTrue16WithSramECC : True16PredicateClass<"Subtarget->useRealTrue16Insts() && " "!Subtarget->d16PreservesUnusedBits()">; -def HasD16Writes32BitVgpr: Predicate<"Subtarget->hasD16Writes32BitVgpr()">, - AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureD16Writes32BitVgpr)>; def NotHasD16Writes32BitVgpr: Predicate<"!Subtarget->hasD16Writes32BitVgpr()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not FeatureD16Writes32BitVgpr))>; -def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, - AssemblerPredicate<(all_of FeatureBF16TransInsts)>; - -def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, - AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; - -def HasBF16PackedInsts : Predicate<"Subtarget->hasBF16PackedInsts()">, - AssemblerPredicate<(all_of FeatureBF16PackedInsts)>; - -def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, - AssemblerPredicate<(all_of FeatureVOP3P)>; - def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">; def HasMed3_16 : Predicate<"Subtarget->hasMed3_16()">; @@ -2620,8 +2523,6 @@ def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes() def HasFminFmaxLegacy : Predicate<"Subtarget->hasFminFmaxLegacy()">; -def HasSDWA : Predicate<"Subtarget->hasSDWA()">; - def HasSDWA8 : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<(all_of (not FeatureGFX9Insts), FeatureSDWA)>; @@ -2639,12 +2540,6 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">, def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>; -def HasDPALU_DPP : Predicate<"Subtarget->hasDPALU_DPP()">, - AssemblerPredicate<(all_of FeatureDPALU_DPP)>; - -def HasPackedFP32Ops : 
Predicate<"Subtarget->hasPackedFP32Ops()">, - AssemblerPredicate<(all_of FeaturePackedFP32Ops)>; - def HasPkMovB32 : Predicate<"Subtarget->hasPkMovB32()">, AssemblerPredicate<(all_of FeatureGFX90AInsts)>; @@ -2656,14 +2551,6 @@ def HasFmaakFmamkF64Insts : Predicate<"Subtarget->hasFmaakFmamkF64Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; -def HasAddMinMaxInsts : - Predicate<"Subtarget->hasAddMinMaxInsts()">, - AssemblerPredicate<(any_of FeatureGFX1250Insts)>; - -def HasPkAddMinMaxInsts : - Predicate<"Subtarget->hasPkAddMinMaxInsts()">, - AssemblerPredicate<(any_of FeatureGFX1250Insts)>; - def HasPkMinMax3Insts : Predicate<"Subtarget->hasPkMinMax3Insts()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; @@ -2672,295 +2559,92 @@ def HasSGetShaderCyclesInst : Predicate<"Subtarget->hasSGetShaderCyclesInst()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; -def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, - AssemblerPredicate<(all_of FeatureImageInsts)>; - -def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">, - AssemblerPredicate<(all_of FeatureExtendedImageInsts)>; - -def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, - AssemblerPredicate<(all_of FeatureR128A16)>; - -def HasA16 : Predicate<"Subtarget->hasA16()">, - AssemblerPredicate<(all_of FeatureA16)>; - -def HasG16 : Predicate<"Subtarget->hasG16()">, - AssemblerPredicate<(all_of FeatureG16)>; - def HasDPP16 : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>; -def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, - AssemblerPredicate<(all_of FeatureIntClamp)>; - -def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, - AssemblerPredicate<(all_of FeatureMadMixInsts)>; - -def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, - AssemblerPredicate<(all_of FeatureScalarStores)>; - -def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, - 
AssemblerPredicate<(all_of FeatureScalarAtomics)>; - -def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, - AssemblerPredicate<(all_of FeatureNoSdstCMPX)>; - def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, AssemblerPredicate<(all_of (not FeatureNoSdstCMPX))>; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; -def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, - AssemblerPredicate<(all_of FeatureVGPRIndexMode)>; -def HasMovrel : Predicate<"Subtarget->hasMovrel()">, - AssemblerPredicate<(all_of FeatureMovrel)>; - -def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, - AssemblerPredicate<(all_of FeatureFmaMixInsts)>; - -def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">, - AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>; - -def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, - AssemblerPredicate<(all_of FeatureDLInsts)>; - -def HasFmacF64Inst : Predicate<"Subtarget->hasFmacF64Inst()">, - AssemblerPredicate<(all_of FeatureFmacF64Inst)>; - -def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">, - AssemblerPredicate<(all_of FeatureDot1Insts)>; - -def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">, - AssemblerPredicate<(all_of FeatureDot2Insts)>; - -def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">, - AssemblerPredicate<(all_of FeatureDot3Insts)>; - -def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">, - AssemblerPredicate<(all_of FeatureDot4Insts)>; - -def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">, - AssemblerPredicate<(all_of FeatureDot5Insts)>; - -def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, - AssemblerPredicate<(all_of FeatureDot6Insts)>; - -def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">, - AssemblerPredicate<(all_of FeatureDot7Insts)>; - -def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">, - AssemblerPredicate<(all_of FeatureDot8Insts)>; - 
-def HasDot9Insts : Predicate<"Subtarget->hasDot9Insts()">, - AssemblerPredicate<(all_of FeatureDot9Insts)>; - -def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">, - AssemblerPredicate<(all_of FeatureDot10Insts)>; - -def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">, - AssemblerPredicate<(all_of FeatureDot11Insts)>; - -def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">, - AssemblerPredicate<(all_of FeatureDot12Insts)>; - -def HasDot13Insts : Predicate<"Subtarget->hasDot13Insts()">, - AssemblerPredicate<(all_of FeatureDot13Insts)>; - -def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, - AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; - -def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, - AssemblerPredicate<(all_of FeatureMAIInsts)>; def NotHasMAIInsts : Predicate<"!Subtarget->hasMAIInsts()">, AssemblerPredicate<(all_of (not FeatureMAIInsts))>; -def HasSMemRealTime : Predicate<"Subtarget->hasSMemRealTime()">, - AssemblerPredicate<(all_of FeatureSMemRealTime)>; - -def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, - AssemblerPredicate<(all_of FeatureSMemTimeInst)>; - -def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">, - AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>; - -def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">; - -def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, - AssemblerPredicate<(all_of FeatureFP8Insts)>; - -def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">, - AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>; - -def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">, - AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>; - def NotHasFP8E5M3Insts : Predicate<"!Subtarget->hasFP8E5M3Insts()">, AssemblerPredicate<(all_of (not FeatureFP8E5M3Insts))>; -def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, - AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; 
- -def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, - AssemblerPredicate<(all_of FeatureMadMacF32Insts)>; - def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; -def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">, - AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>; +def HasFmacLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts() && Subtarget->getGeneration() < AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX10_3Insts, (not FeatureGFX12Insts))>; -def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">, - AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>; +def HasAtomicDsCondSubClampInsts : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; -def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; -def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; def HasAtomicBufferGlobalPkAddF16NoRtnInsts : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, AssemblerPredicate<(any_of FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts)>; -def HasAtomicBufferGlobalPkAddF16Insts - : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, - AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>; -def HasAtomicGlobalPkAddBF16Inst - : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">, - AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; -def HasAtomicBufferPkAddBF16Inst - : Predicate<"Subtarget->hasAtomicBufferPkAddBF16Inst()">, - AssemblerPredicate<(all_of FeatureAtomicBufferPkAddBF16Inst)>; -def HasFlatAtomicFaddF32Inst - : 
Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, - AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; - -def HasDefaultComponentZero - : Predicate<"Subtarget->hasDefaultComponentZero()">, - AssemblerPredicate<(all_of FeatureDefaultComponentZero)>; -def HasDefaultComponentBroadcast - : Predicate<"Subtarget->hasDefaultComponentBroadcast()">, - AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>; - -def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, - AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">, AssemblerPredicate<(any_of FeatureGFX1250Insts)>; -def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; - -def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; - -def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">, - AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>; +def HasFlatScratchEnabled : Predicate<"Subtarget->hasFlatScratchEnabled()">; -def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">; +def NotHasFlatScratchEnabled : Predicate<"!Subtarget->hasFlatScratchEnabled()">; -def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; - -def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, - AssemblerPredicate<(all_of FeatureSALUFloatInsts)>; +def NotHasMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; def NotHasSALUFloatInsts : Predicate<"!Subtarget->hasSALUFloatInsts()">, AssemblerPredicate<(all_of (not FeatureSALUFloatInsts))>; -def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, - AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; - -def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, - AssemblerPredicate<(all_of FeatureBitOp3Insts)>; - -def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, - AssemblerPredicate<(all_of FeatureTanhInsts)>; - -def HasTensorCvtLutInsts : 
Predicate<"Subtarget->hasTensorCvtLutInsts()">, - AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>; - -def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, - AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; - -def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">, - AssemblerPredicate<(all_of FeaturePrngInst)>; +def NotHasCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">; -def HasBVHDualAndBVH8Insts : Predicate<"Subtarget->hasBVHDualAndBVH8Insts()">, - AssemblerPredicate<(all_of FeatureBVHDualAndBVH8Insts)>; - -def Has64BitLiterals : Predicate<"Subtarget->has64BitLiterals()">, - AssemblerPredicate<(all_of Feature64BitLiterals)>; - -def Has1024AddressableVGPRs : Predicate<"Subtarget->has1024AddressableVGPRs()">, - AssemblerPredicate<(all_of Feature1024AddressableVGPRs)>; - -def HasWaitXcnt : Predicate<"Subtarget->hasWaitXcnt()">, - AssemblerPredicate<(all_of FeatureWaitXcnt)>; - -def HasFP8ConversionScaleInsts : Predicate<"Subtarget->hasFP8ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureFP8ConversionScaleInsts)>; - -def HasBF8ConversionScaleInsts : Predicate<"Subtarget->hasBF8ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureBF8ConversionScaleInsts)>; - -def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureFP4ConversionScaleInsts)>; - -def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>; - -def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">, - AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>; - -def HasCvtPkF16F32Inst : Predicate<"Subtarget->hasCvtPkF16F32Inst()">, - AssemblerPredicate<(all_of FeatureCvtPkF16F32Inst)>; - -def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">, - 
AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>; - -def HasGDS : Predicate<"Subtarget->hasGDS()">; - -def HasGWS : Predicate<"Subtarget->hasGWS()">; - -def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">; -def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">; - -def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">; - -def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; - -def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, - AssemblerPredicate<(all_of FeatureXF32Insts)>; - -def HasVmemPrefInsts : Predicate<"Subtarget->hasVmemPrefInsts()">, - AssemblerPredicate<(all_of FeatureVmemPrefInsts)>; - -def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, - AssemblerPredicate<(all_of FeatureAshrPkInsts)>; - -def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, - AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; - -def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, - AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; - -def HasMadU32Inst : Predicate<"Subtarget->hasMadU32Inst()">, - AssemblerPredicate<(all_of FeatureMadU32Inst)>; +def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; -def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, - AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; +def NotNeedsAlignedVGPRs : Predicate<"!Subtarget->needsAlignedVGPRs()">, + AssemblerPredicate<(all_of (not FeatureRequiresAlignedVGPRs))>; -def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">, - AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>; +def isWave32 : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; +def isWave64 : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(any_of FeatureWavefrontSize64, + 
FeatureAssemblerPermissiveWavesize)>; -def NeedsAlignedVGPRs : Predicate<"Subtarget->needsAlignedVGPRs()">, - AssemblerPredicate<(all_of FeatureRequiresAlignedVGPRs)>; +def isWave32Strict : Predicate<"Subtarget->isWave32()">, + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; +def isWave64Strict : Predicate<"Subtarget->isWave64()">, + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// -// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement +defvar DefaultMode_Wave64 = DefaultMode; +defvar DefaultMode_Wave32 = HwMode<[isWave32, NotNeedsAlignedVGPRs]>; + +// gfx90a-gfx950. Has AGPRs, and also the align2 VGPR/AGPR requirement. Implied +// wave64. def AVAlign2LoadStoreMode : HwMode<[HasMAIInsts, NeedsAlignedVGPRs]>; // gfx1250, has alignment requirement but no AGPRs. -def AlignedVGPRNoAGPRMode : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs]>; +def AlignedVGPRNoAGPRMode_Wave32 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave32Strict]>; +def AlignedVGPRNoAGPRMode_Wave64 : HwMode<[NotHasMAIInsts, NeedsAlignedVGPRs, isWave64Strict]>; + +// FIXME: This should be able to only define a separate hwmode that +// only depends on wavesize for just ValueTypes. These use different +// HwMode namespaces. 
If we don't define the full set of modes used +// for RegClassByHwMode, tablegen crashes for some reason +def WaveSizeVT : ValueTypeByHwMode<[ + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], [i64, i64, i64, i32, i32]>; // Include AMDGPU TD files diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index c28c25f..2bdadda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -65,7 +65,7 @@ recursivelyVisitUsers(GlobalValue &GV, continue; if (Instruction *I = dyn_cast<Instruction>(U)) { - Function *F = I->getParent()->getParent(); + Function *F = I->getFunction(); if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { // FIXME: This is a horrible hack. We should always respect noinline, // and just let us hit the error when we can't handle this. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index dda8033..346e257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -19,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-argument-reg-usage-info" -INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE, +INITIALIZE_PASS(AMDGPUArgumentUsageInfoWrapperLegacy, DEBUG_TYPE, "Argument Register Usage Information Storage", false, true) void ArgDescriptor::print(raw_ostream &OS, @@ -42,7 +42,7 @@ void ArgDescriptor::print(raw_ostream &OS, OS << '\n'; } -char AMDGPUArgumentUsageInfo::ID = 0; +char AMDGPUArgumentUsageInfoWrapperLegacy::ID = 0; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; @@ -50,15 +50,6 @@ const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo = AMDGPUFunctionArgInfo::fixedABILayout(); 
-bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { - return false; -} - -bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) { - ArgInfoMap.clear(); - return false; -} - // TODO: Print preload kernargs? void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { for (const auto &FI : ArgInfoMap) { @@ -86,6 +77,12 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { } } +bool AMDGPUArgumentUsageInfo::invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker<AMDGPUArgumentUsageAnalysis>(); + return !PAC.preservedWhenStateless(); +} + std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> AMDGPUFunctionArgInfo::getPreloadedValue( AMDGPUFunctionArgInfo::PreloadedValue Value) const { @@ -191,3 +188,10 @@ AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { return FixedABIFunctionInfo; return I->second; } + +AnalysisKey AMDGPUArgumentUsageAnalysis::Key; + +AMDGPUArgumentUsageInfo +AMDGPUArgumentUsageAnalysis::run(Module &M, ModuleAnalysisManager &) { + return AMDGPUArgumentUsageInfo(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 1064e57..f38e49b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -12,7 +12,10 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" +#include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include <variant> namespace llvm { @@ -27,55 +30,44 @@ private: friend struct AMDGPUFunctionArgInfo; friend class AMDGPUArgumentUsageInfo; - union { - MCRegister Reg; - unsigned StackOffset; - }; + std::variant<std::monostate, MCRegister, unsigned> Val; // Bitmask to locate argument within the register. 
unsigned Mask; - bool IsStack : 1; - bool IsSet : 1; - public: - ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, - bool IsSet = false) - : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + ArgDescriptor(unsigned Mask = ~0u) : Mask(Mask) {} static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { - return ArgDescriptor(Reg, Mask, false, true); + ArgDescriptor Ret(Mask); + Ret.Val = Reg.asMCReg(); + return Ret; } static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { - return ArgDescriptor(Offset, Mask, true, true); + ArgDescriptor Ret(Mask); + Ret.Val = Offset; + return Ret; } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { - return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); + // Copy the descriptor, then change the mask. + ArgDescriptor Ret(Arg); + Ret.Mask = Mask; + return Ret; } - bool isSet() const { - return IsSet; - } + bool isSet() const { return !std::holds_alternative<std::monostate>(Val); } explicit operator bool() const { return isSet(); } - bool isRegister() const { - return !IsStack; - } + bool isRegister() const { return std::holds_alternative<MCRegister>(Val); } - MCRegister getRegister() const { - assert(!IsStack); - return Reg; - } + MCRegister getRegister() const { return std::get<MCRegister>(Val); } - unsigned getStackOffset() const { - assert(IsStack); - return StackOffset; - } + unsigned getStackOffset() const { return std::get<unsigned>(Val); } unsigned getMask() const { // None of the target SGPRs or VGPRs are expected to have a 'zero' mask. 
@@ -96,7 +88,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { } struct KernArgPreloadDescriptor : public ArgDescriptor { - KernArgPreloadDescriptor() {} + KernArgPreloadDescriptor() = default; SmallVector<MCRegister> Regs; }; @@ -178,32 +170,67 @@ struct AMDGPUFunctionArgInfo { static AMDGPUFunctionArgInfo fixedABILayout(); }; -class AMDGPUArgumentUsageInfo : public ImmutablePass { +class AMDGPUArgumentUsageInfo { private: DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; public: - static char ID; - static const AMDGPUFunctionArgInfo ExternFunctionInfo; static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; - AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } + void print(raw_ostream &OS, const Module *M = nullptr) const; + + void clear() { ArgInfoMap.clear(); } + + void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { + ArgInfoMap[&F] = ArgInfo; + } + + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; + + bool invalidate(Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv); +}; + +class AMDGPUArgumentUsageInfoWrapperLegacy : public ImmutablePass { + std::unique_ptr<AMDGPUArgumentUsageInfo> AUIP; + +public: + static char ID; + + AMDGPUArgumentUsageInfoWrapperLegacy() : ImmutablePass(ID) {} + + AMDGPUArgumentUsageInfo &getArgUsageInfo() { return *AUIP; } + const AMDGPUArgumentUsageInfo &getArgUsageInfo() const { return *AUIP; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } - bool doInitialization(Module &M) override; - bool doFinalization(Module &M) override; + bool doInitialization(Module &M) override { + AUIP = std::make_unique<AMDGPUArgumentUsageInfo>(); + return false; + } - void print(raw_ostream &OS, const Module *M = nullptr) const override; + bool doFinalization(Module &M) override { + AUIP->clear(); + return false; + } - void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { - 
ArgInfoMap[&F] = ArgInfo; + void print(raw_ostream &OS, const Module *M = nullptr) const override { + AUIP->print(OS, M); } +}; - const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; +class AMDGPUArgumentUsageAnalysis + : public AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis> { + friend AnalysisInfoMixin<AMDGPUArgumentUsageAnalysis>; + static AnalysisKey Key; + +public: + using Result = AMDGPUArgumentUsageInfo; + + AMDGPUArgumentUsageInfo run(Module &M, ModuleAnalysisManager &); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 19e2a6a..9af3b05 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -208,7 +208,8 @@ void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Value *NumBits = IRB.CreateTypeSize(IntptrTy, TypeStoreSize); Value *Size = IRB.CreateLShr(NumBits, ConstantInt::get(IntptrTy, 3)); Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy); - Value *SizeMinusOne = IRB.CreateAdd(Size, ConstantInt::get(IntptrTy, -1)); + Value *SizeMinusOne = + IRB.CreateAdd(Size, ConstantInt::getAllOnesValue(IntptrTy)); Value *LastByte = IRB.CreateIntToPtr(IRB.CreateAdd(AddrLong, SizeMinusOne), AddrTy); instrumentAddressImpl(M, IRB, OrigIns, InsertBefore, Addr, {}, 8, IsWrite, @@ -244,11 +245,8 @@ void getInterestingMemoryOperands( // Masked store has an initial operand for the value. unsigned OpOffset = IsWrite ? 1 : 0; Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType(); - MaybeAlign Alignment = Align(1); - // Otherwise no alignment guarantees. We probably got Undef. 
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset))) - Alignment = Op->getMaybeAlignValue(); - Value *Mask = CI->getOperand(2 + OpOffset); + MaybeAlign Alignment = CI->getParamAlign(OpOffset); + Value *Mask = CI->getOperand(1 + OpOffset); Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 29f8f9b..7d2df427 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -323,7 +323,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { "' is already defined"); const DataLayout &DL = GV->getDataLayout(); - uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); + uint64_t Size = GV->getGlobalSize(DL); Align Alignment = GV->getAlign().value_or(Align(4)); emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); @@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { return AsmPrinter::doInitialization(M); } +/// Mimics GCNSubtarget::computeOccupancy for MCExpr. +/// +/// Remove dependency on GCNSubtarget and depend only only the necessary values +/// for said occupancy computation. Should match computeOccupancy implementation +/// without passing \p STM on. 
+const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + unsigned DynamicVGPRBlockSize, + const GCNSubtarget &STM, MCContext &Ctx) { + unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); + unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); + unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); + unsigned Generation = STM.getGeneration(); + + auto CreateExpr = [&Ctx](unsigned Value) { + return MCConstantExpr::create(Value, Ctx); + }; + + return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy, + {CreateExpr(MaxWaves), CreateExpr(Granule), + CreateExpr(TargetTotalNumVGPRs), + CreateExpr(Generation), CreateExpr(InitOcc), + NumSGPRs, NumVGPRs}, + Ctx); +} + void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())) return; @@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { MaxWaves, MFI.getDynamicVGPRBlockSize())}); uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); - const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( + const MCExpr *OccupancyExpr = createOccupancy( STM.getOccupancyWithWorkGroupSizes(*MF).second, MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), @@ -508,9 +534,9 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { MCSectionELF *MaxGPRSection = OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0); OutStreamer->switchSection(MaxGPRSection); - getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext), - RI.getMaxAGPRSymbol(OutContext), - RI.getMaxSGPRSymbol(OutContext)); + getTargetStreamer()->EmitMCResourceMaximums( + RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext), + RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext)); OutStreamer->popSection(); for 
(Function &F : M.functions()) @@ -634,7 +660,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, (void)PGRM_Rsrc3; (void)EvaluatableRsrc3; assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || - STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 || + STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 || static_cast<uint64_t>(PGRM_Rsrc3) == 0); KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3; @@ -805,7 +831,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { " AccumOffset: " + getMCExprStr(AdjustedAccum), false); } - if (AMDGPU::isGFX1250(STM)) + if (STM.hasGFX1250Insts()) OutStreamer->emitRawComment( " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt), false); @@ -841,7 +867,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { [[maybe_unused]] int64_t PGMRSrc3; assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || - STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || + STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) && static_cast<uint64_t>(PGMRSrc3) == 0)); if (STM.hasGFX90AInsts()) { @@ -1160,21 +1186,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { - // LDS is allocated in 256 dword blocks. - LDSAlignShift = 10; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 320 dword blocks. + unsigned LDSAlignShift = 8; + switch (getLdsDwGranularity(STM)) { + case 512: + case 320: LDSAlignShift = 11; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize65536)) { - // LDS is allocated in 128 dword blocks. 
+ break; + case 128: LDSAlignShift = 9; - } else { - // LDS is allocated in 64 dword blocks. + break; + case 64: LDSAlignShift = 8; + break; + default: + llvm_unreachable("invald LDS block size"); } ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); @@ -1230,8 +1255,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. - ProgInfo.TrapHandlerEnable = - STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); + ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler(); ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); @@ -1264,13 +1288,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } - if (AMDGPU::isGFX1250(STM)) + if (STM.hasGFX1250Insts()) ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); - ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( + ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, MFI->getDynamicVGPRBlockSize(), STM, Ctx); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 0a163f8..784ee36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -589,7 +589,7 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( // return the next active lane auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1); - auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1)); + auto *InverseMask = B.CreateXor(Mask, ConstantInt::getAllOnesValue(WaveTy)); auto *NewActiveBits 
= B.CreateAnd(ActiveBits, InverseMask); ActiveBits->addIncoming(NewActiveBits, ComputeLoop); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 9907c88f..b86a4ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -200,16 +201,6 @@ public: /// Get code object version. unsigned getCodeObjectVersion() const { return CodeObjectVersion; } - /// Get the effective value of "amdgpu-waves-per-eu" for the function, - /// accounting for the interaction with the passed value to use for - /// "amdgpu-flat-work-group-size". 
- std::pair<unsigned, unsigned> - getWavesPerEU(const Function &F, - std::pair<unsigned, unsigned> FlatWorkGroupSize) { - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F); - } - std::optional<std::pair<unsigned, unsigned>> getWavesPerEUAttr(const Function &F) { auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", @@ -223,15 +214,6 @@ public: return std::make_pair(Val->first, *(Val->second)); } - std::pair<unsigned, unsigned> - getEffectiveWavesPerEU(const Function &F, - std::pair<unsigned, unsigned> WavesPerEU, - std::pair<unsigned, unsigned> FlatWorkGroupSize) { - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize, - getLDSSize(F)); - } - unsigned getMaxWavesPerEU(const Function &F) { const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); return ST.getMaxWavesPerEU(); @@ -258,14 +240,6 @@ private: return Status; } - /// Returns the minimum amount of LDS space used by a workgroup running - /// function \p F. - static unsigned getLDSSize(const Function &F) { - return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size", - {0, UINT32_MAX}, true) - .first; - } - /// Get the constant access bitmap for \p C. uint8_t getConstantAccess(const Constant *C, SmallPtrSetImpl<const Constant *> &Visited) { @@ -534,6 +508,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. 
+ if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1336,7 +1325,6 @@ struct AAAMDGPUMinAGPRAlloc Maximum.takeAssumedMaximum(NumRegs); return true; } - switch (CB.getIntrinsicID()) { case Intrinsic::not_intrinsic: break; @@ -1354,10 +1342,24 @@ struct AAAMDGPUMinAGPRAlloc return true; } + // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have + // the nocallback attribute, so the AMDGPU attributor can conservatively + // drop all implicitly-known inputs and AGPR allocation information. Make + // sure we still infer that no implicit inputs are required and that the + // AGPR allocation stays at zero. Trap-like intrinsics may invoke a + // function which requires AGPRs, so we need to check if the called + // function has the "trap-func-name" attribute. + case Intrinsic::trap: + case Intrinsic::debugtrap: + case Intrinsic::ubsantrap: + return CB.hasFnAttr(Attribute::NoCallback) || + !CB.hasFnAttr("trap-func-name"); default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. 
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes @@ -1555,7 +1557,7 @@ private: AMDGPU::ClusterDimsAttr Attr; - static constexpr const char AttrName[] = "amdgpu-cluster-dims"; + static constexpr char AttrName[] = "amdgpu-cluster-dims"; }; AAAMDGPUClusterDims & @@ -1584,7 +1586,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID, - &AAAMDGPUClusterDims::ID}); + &AAAMDGPUClusterDims::ID, &AAAlign::ID}); AttributorConfig AC(CGUpdater); AC.IsClosedWorldModule = Options.IsClosedWorld; @@ -1642,6 +1644,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, if (Ptr) { A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr)); A.getOrCreateAAFor<AANoAliasAddrSpace>(IRPosition::value(*Ptr)); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) { + if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc) + A.getOrCreateAAFor<AAAlign>(IRPosition::value(*Ptr)); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp new file mode 100644 index 0000000..c9fcec8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -0,0 +1,120 @@ +//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to add latency to: +/// 1. Barrier edges between ATOMIC_FENCE instructions and preceding +/// memory accesses potentially affected by the fence. 
+/// This encourages the scheduling of more instructions before +/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may +/// introduce wait counting or indicate an impending S_BARRIER +/// wait. Having more instructions in-flight across these +/// constructs improves latency hiding. +/// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT. +/// This encourages independent work to be scheduled between +/// signal and wait, hiding barrier synchronization latency. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUBarrierLatency.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +static cl::opt<unsigned> BarrierSignalWaitLatencyOpt( + "amdgpu-barrier-signal-wait-latency", + cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " + "to encourage scheduling independent work between them"), + cl::init(16), cl::Hidden); + +namespace { + +class BarrierLatency : public ScheduleDAGMutation { +private: + SmallSet<SyncScope::ID, 4> IgnoredScopes; + +public: + BarrierLatency(MachineFunction *MF) { + LLVMContext &Context = MF->getFunction().getContext(); + IgnoredScopes.insert(SyncScope::SingleThread); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + if (!ST.requiresWaitOnWorkgroupReleaseFence()) { + // Prior to GFX10 workgroup scope does not normally require waitcnts + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup")); + } + } + void apply(ScheduleDAGInstrs *DAG) override; +}; + +void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) { + SUnit *PredSU = PredDep.getSUnit(); + 
SDep ForwardD = PredDep; + ForwardD.setSUnit(&SU); + for (SDep &SuccDep : PredSU->Succs) { + if (SuccDep == ForwardD) { + SuccDep.setLatency(SuccDep.getLatency() + Latency); + break; + } + } + PredDep.setLatency(PredDep.getLatency() + Latency); + PredSU->setDepthDirty(); + SU.setDepthDirty(); +} + +void BarrierLatency::apply(ScheduleDAGInstrs *DAG) { + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII); + constexpr unsigned FenceLatency = 2000; + const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt; + + for (SUnit &SU : DAG->SUnits) { + const MachineInstr *MI = SU.getInstr(); + unsigned Op = MI->getOpcode(); + + if (Op == AMDGPU::ATOMIC_FENCE) { + // Update latency on barrier edges of ATOMIC_FENCE. + // Ignore scopes not expected to have any latency. + SyncScope::ID SSID = + static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + if (IgnoredScopes.contains(SSID)) + continue; + + for (SDep &PredDep : SU.Preds) { + if (!PredDep.isBarrier()) + continue; + SUnit *PredSU = PredDep.getSUnit(); + MachineInstr *MI = PredSU->getInstr(); + // Only consider memory loads + if (!MI->mayLoad() || MI->mayStore()) + continue; + addLatencyToEdge(PredDep, SU, FenceLatency); + } + } else if (Op == AMDGPU::S_BARRIER_WAIT) { + for (SDep &PredDep : SU.Preds) { + SUnit *PredSU = PredDep.getSUnit(); + const MachineInstr *PredMI = PredSU->getInstr(); + if (TII->isBarrierStart(PredMI->getOpcode())) { + addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency); + } + } + } + } +} + +} // end namespace + +std::unique_ptr<ScheduleDAGMutation> +llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<BarrierLatency>(MF); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h new file mode 100644 index 0000000..547cd2a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h @@ -0,0 +1,24 @@ +//===- AMDGPUBarrierLatency.h - AMDGPU Export Clustering --------*- C++ 
-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 7afadde..5c6affd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #define DEBUG_TYPE "amdgpu-call-lowering" @@ -209,7 +210,7 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { if (!SPReg) { const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>(); - if (ST.enableFlatScratch()) { + if (ST.hasFlatScratchEnabled()) { // The stack is accessed unswizzled, so we can use a regular copy. 
SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0); @@ -414,12 +415,13 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg, MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getDataLayout(); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF); LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); SmallVector<ArgInfo, 32> SplitArgs; - SmallVector<uint64_t> FieldOffsets; + SmallVector<TypeSize> FieldOffsets; splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets); unsigned Idx = 0; @@ -737,7 +739,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( // For the fixed ABI, pass workitem IDs in the last argument register. TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); - if (!Subtarget.enableFlatScratch()) + if (!Subtarget.hasFlatScratchEnabled()) CCInfo.AllocateReg(Info->getScratchRSrcReg()); TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } @@ -1196,7 +1198,7 @@ void AMDGPUCallLowering::handleImplicitCallArguments( const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo, CallingConv::ID CalleeCC, ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const { - if (!ST.enableFlatScratch()) { + if (!ST.hasFlatScratchEnabled()) { // Insert copies for the SRD. In the HSA case, this should be an identity // copy. 
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index e891fdb..2932bbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -14,6 +14,10 @@ class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} class CCIfExtend<CCAction A> : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; +class CCIfOrigTypeShaderCCIsSGPR<CCAction A> + : CCIf<[{(!OrigTy->getScalarType()->isFloatTy() && + !OrigTy->getScalarType()->isHalfTy()) }], A>; + // Calling convention for SI def CC_SI_Gfx : CallingConv<[ @@ -56,14 +60,15 @@ def CC_SI_SHADER : CallingConv<[ >>> ]>; + def RetCC_SI_Shader : CallingConv<[ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, - CCIfType<[i32, i16, v2i16] , CCAssignToReg< + CCIfType<[i32, i16, v2i16], CCIfOrigTypeShaderCCIsSGPR<CCAssignToReg< !foreach(i, !range(0, 44), !cast<Register>("SGPR"#i)) // SGPR0-43 - >>, + >>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
- CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg< + CCIfType<[f32, f16, v2f16, bf16, v2bf16, i32, i16, v2i16] , CCAssignToReg< !foreach(i, !range(0, 136), !cast<Register>("VGPR"#i)) // VGPR0-135 >> ]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba7..e51d2c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -33,6 +33,7 @@ #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" @@ -100,10 +101,9 @@ public: const GCNSubtarget &ST; const AMDGPUTargetMachine &TM; const TargetLibraryInfo *TLI; - AssumptionCache *AC; - const DominatorTree *DT; const UniformityInfo &UA; const DataLayout &DL; + SimplifyQuery SQ; const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; @@ -115,8 +115,8 @@ public: AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM, const TargetLibraryInfo *TLI, AssumptionCache *AC, const DominatorTree *DT, const UniformityInfo &UA) - : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC), - DT(DT), UA(UA), DL(F.getDataLayout()), + : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA), + DL(F.getDataLayout()), SQ(DL, TLI, DT, AC), HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == DenormalMode::getPreserveSign()) {} @@ -143,21 +143,14 @@ public: bool canBreakPHINode(const PHINode &I); - /// \returns True if binary operation \p I is a signed binary operation, false - /// otherwise. - bool isSigned(const BinaryOperator &I) const; - - /// \returns True if the condition of 'select' operation \p I comes from a - /// signed 'icmp' operation, false otherwise. 
- bool isSigned(const SelectInst &I) const; - /// Return true if \p T is a legal scalar floating point type. bool isLegalFloatingTy(const Type *T) const; /// Wrapper to pass all the arguments to computeKnownFPClass KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested, const Instruction *CtxI) const { - return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT); + return llvm::computeKnownFPClass(V, Interested, + SQ.getWithInstruction(CtxI)); } bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const { @@ -168,12 +161,12 @@ public: /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to /// the original will not change the value. - unsigned numBitsUnsigned(Value *Op) const; + unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const; /// \returns The minimum number of bits needed to store the value of \Op as a /// signed integer. Truncating to this size and then sign-extending to /// the original size will not change the value. - unsigned numBitsSigned(Value *Op) const; + unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const; /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24. 
/// SelectionDAG has an issue where an and asserting the bits are known @@ -218,8 +211,7 @@ public: Value *matchFractPat(IntrinsicInst &I); Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); - bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, - FastMathFlags SqrtFMF) const; + bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const; Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, FastMathFlags SqrtFMF, @@ -244,6 +236,14 @@ public: FastMathFlags FMF) const; Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, FastMathFlags FMF) const; + Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF, + FastMathFlags DivFMF, const Instruction *CtxI, + bool IsNegative) const; + + CallInst *createWorkitemIdX(IRBuilder<> &B) const; + void replaceWithWorkitemIdX(Instruction &I) const; + void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const; + bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const; bool tryNarrowMathIfNoOverflow(Instruction *I); @@ -260,6 +260,8 @@ public: bool visitIntrinsicInst(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); + bool visitMbcntLo(IntrinsicInst &I) const; + bool visitMbcntHi(IntrinsicInst &I) const; bool run(); }; @@ -304,16 +306,6 @@ bool AMDGPUCodeGenPrepareImpl::run() { return MadeChange; } -bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { - return I.getOpcode() == Instruction::AShr || - I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; -} - -bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { - return isa<ICmpInst>(I.getOperand(0)) && - cast<ICmpInst>(I.getOperand(0))->isSigned(); -} - bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || (Ty->isHalfTy() && ST.has16BitInsts()); @@ -327,12 +319,16 @@ bool 
AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); } -unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { - return computeKnownBits(Op, DL, AC).countMaxActiveBits(); +unsigned +AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op, + const Instruction *CtxI) const { + return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits(); } -unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const { - return ComputeMaxSignificantBits(Op, DL, AC); +unsigned +AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op, + const Instruction *CtxI) const { + return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT); } static void extractValues(IRBuilder<> &Builder, @@ -383,12 +379,12 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { unsigned LHSBits = 0, RHSBits = 0; bool IsSigned = false; - if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && - (RHSBits = numBitsUnsigned(RHS)) <= 24) { + if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 && + (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) { IsSigned = false; - } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && - (RHSBits = numBitsSigned(RHS)) <= 24) { + } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 && + (RHSBits = numBitsSigned(RHS, &I)) <= 24) { IsSigned = true; } else @@ -623,15 +619,101 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, return Builder.CreateFMul(Rsq, OutputScaleFactor); } -bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, - FastMathFlags DivFMF, - FastMathFlags SqrtFMF) const { - // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. - if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) - return false; +/// Emit inverse sqrt expansion for f64 with a correction sequence on top of +/// v_rsq_f64. This should give a 1ulp result. 
+Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X, + FastMathFlags SqrtFMF, + FastMathFlags DivFMF, + const Instruction *CtxI, + bool IsNegative) const { + // rsq(x): + // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x); + // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0); + // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0); + // + // -rsq(x): + // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x); + // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0); + // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0); + // + // The rsq instruction handles the special cases correctly. We need to check + // for the edge case conditions to ensure the special case propagates through + // the later instructions. + + Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X); + + // Try to elide the edge case check. + // + // Fast math flags imply: + // sqrt ninf => !isinf(x) + // fdiv ninf => x != 0, !isinf(x) + bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs(); + bool MaybeZero = !DivFMF.noInfs(); + + DenormalMode DenormMode; + FPClassTest Interested = fcNone; + if (MaybePosInf) + Interested = fcPosInf; + if (MaybeZero) + Interested |= fcZero; + + if (Interested != fcNone) { + KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI); + if (KnownSrc.isKnownNeverPosInfinity()) + MaybePosInf = false; + + DenormMode = F.getDenormalMode(X->getType()->getFltSemantics()); + if (KnownSrc.isKnownNeverLogicalZero(DenormMode)) + MaybeZero = false; + } + + Value *SpecialOrRsq = X; + if (MaybeZero || MaybePosInf) { + Value *Cond; + if (MaybePosInf && MaybeZero) { + if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) { + FPClassTest TestMask = fcPosInf | fcZero; + if (DenormMode.inputsAreZero()) + TestMask |= fcSubnormal; - // v_rsq_f32 gives 1ulp - return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; + Cond = Builder.createIsFPClass(X, TestMask); + } else { + // Avoid using 
llvm.is.fpclass for dynamic denormal mode, since it + // doesn't respect the floating-point environment. + Value *IsZero = + Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType())); + Value *IsInf = + Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType())); + Cond = Builder.CreateOr(IsZero, IsInf); + } + } else if (MaybeZero) { + Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType())); + } else { + Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType())); + } + + SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X); + } + + Value *NegY0 = Builder.CreateFNeg(Y0); + Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0); + + // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64. + Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0)); + + Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0); + + Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375), + ConstantFP::get(X->getType(), 0.5)); + + return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0); +} + +bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF, + FastMathFlags SqrtFMF) const { + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and + // f64. + return DivFMF.allowContract() && SqrtFMF.allowContract(); } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -647,8 +729,6 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( if (!CLHS) return nullptr; - assert(Den->getType()->isFloatTy()); - bool IsNegative = false; // TODO: Handle other numerator values with arcp. @@ -657,14 +737,20 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<>::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || - canIgnoreDenormalInput(Den, CtxI)) { - Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); - // -1.0 / sqrt(x) -> fneg(rsq(x)) - return IsNegative ? 
Builder.CreateFNeg(Result) : Result; + if (Den->getType()->isFloatTy()) { + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || + canIgnoreDenormalInput(Den, CtxI)) { + Value *Result = + Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); + // -1.0 / sqrt(x) -> fneg(rsq(x)) + return IsNegative ? Builder.CreateFNeg(Result) : Result; + } + + return emitRsqIEEE1ULP(Builder, Den, IsNegative); } - return emitRsqIEEE1ULP(Builder, Den, IsNegative); + if (Den->getType()->isDoubleTy()) + return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative); } return nullptr; @@ -776,6 +862,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( return Rsq; } + if (!Num->getType()->isFloatTy()) + return nullptr; + Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst); if (Rcp) return Rcp; @@ -811,7 +900,8 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { return false; Type *Ty = FDiv.getType()->getScalarType(); - if (!Ty->isFloatTy()) + const bool IsFloat = Ty->isFloatTy(); + if (!IsFloat && !Ty->isDoubleTy()) return false; // The f64 rcp/rsq approximations are pretty inaccurate. We can do an @@ -832,10 +922,14 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { DenII->hasOneUse()) { const auto *SqrtOp = cast<FPMathOperator>(DenII); SqrtFMF = SqrtOp->getFastMathFlags(); - if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) + if (canOptimizeWithRsq(DivFMF, SqrtFMF)) RsqOp = SqrtOp->getOperand(0); } + // rcp path not yet implemented for f64. + if (!IsFloat && !RsqOp) + return false; + // Inaccurate rcp is allowed with afn. // // Defer to codegen to handle this. @@ -850,7 +944,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { return false; // Defer the correct implementations to codegen. 
- if (ReqdAccuracy < 1.0f) + if (IsFloat && ReqdAccuracy < 1.0f) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); @@ -929,13 +1023,13 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, Den->getType()->getScalarSizeInBits()); unsigned SSBits = Num->getType()->getScalarSizeInBits(); if (IsSigned) { - unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I); + unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT); // A sign bit needs to be reserved for shrinking. unsigned DivBits = SSBits - RHSSignBits + 1; if (DivBits > MaxDivBits) return SSBits; - unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I); + unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I); unsigned SignBits = std::min(LHSSignBits, RHSSignBits); DivBits = SSBits - SignBits + 1; @@ -944,7 +1038,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, // All bits are used for unsigned division for Num or Den in range // (SignedMax, UnsignedMax]. - KnownBits Known = computeKnownBits(Den, DL, AC, &I); + KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I)); if (Known.isNegative() || !Known.isNonNegative()) return SSBits; unsigned RHSSignBits = Known.countMinLeadingZeros(); @@ -952,7 +1046,7 @@ unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, if (DivBits > MaxDivBits) return SSBits; - Known = computeKnownBits(Num, DL, AC, &I); + Known = computeKnownBits(Num, SQ.getWithInstruction(&I)); if (Known.isNegative() || !Known.isNonNegative()) return SSBits; unsigned LHSSignBits = Known.countMinLeadingZeros(); @@ -1089,7 +1183,7 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, // If there's no wider mulhi, there's only a better expansion for powers of // two. // TODO: Should really know for each vector element. 
- if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT)) + if (isKnownToBeAPowerOfTwo(C, true, SQ.getWithInstruction(&I))) return true; return false; @@ -1099,7 +1193,8 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (BinOpDen->getOpcode() == Instruction::Shl && isa<Constant>(BinOpDen->getOperand(0)) && - isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) { + isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true, + SQ.getWithInstruction(&I))) { return true; } } @@ -1910,6 +2005,10 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::amdgcn_mbcnt_lo: + return visitMbcntLo(I); + case Intrinsic::amdgcn_mbcnt_hi: + return visitMbcntHi(I); default: return false; } @@ -1984,7 +2083,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { // Match pattern for fract intrinsic in contexts where the nan check has been // optimized out (and hope the knowledge the source can't be nan wasn't lost). - if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI))) + if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I))) return false; IRBuilder<> Builder(&I); @@ -2090,6 +2189,110 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) +/// Create a workitem.id.x intrinsic call with range metadata. +CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const { + CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + ST.makeLIDRangeMetadata(Tid); + return Tid; +} + +/// Replace the instruction with a direct workitem.id.x call. 
+void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const { + IRBuilder<> B(&I); + CallInst *Tid = createWorkitemIdX(B); + BasicBlock::iterator BI(&I); + ReplaceInstWithValue(BI, Tid); +} + +/// Replace the instruction with (workitem.id.x & mask). +void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX( + Instruction &I, unsigned WaveSize) const { + IRBuilder<> B(&I); + CallInst *Tid = createWorkitemIdX(B); + Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1); + Value *AndInst = B.CreateAnd(Tid, Mask); + BasicBlock::iterator BI(&I); + ReplaceInstWithValue(BI, AndInst); +} + +/// Try to optimize mbcnt instruction by replacing with workitem.id.x when +/// work group size allows direct computation of lane ID. +/// Returns true if optimization was applied, false otherwise. +bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I, + unsigned Wave) const { + std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0); + if (!MaybeX) + return false; + + // When work group size == wave_size, each work group contains exactly one + // wave, so the instruction can be replaced with workitem.id.x directly. + if (*MaybeX == Wave) { + replaceWithWorkitemIdX(I); + return true; + } + + // When work group evenly splits into waves, compute lane ID within wave + // using bit masking: lane_id = workitem.id.x & (wave_size - 1). + if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) { + replaceWithMaskedWorkitemIdX(I, Wave); + return true; + } + + return false; +} + +/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation. +bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const { + // This optimization only applies to wave32 targets where mbcnt.lo operates on + // the full execution mask. + if (!ST.isWave32()) + return false; + + // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with + // lower IDs. 
+ if (!match(&I, + m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero()))) + return false; + + return tryReplaceWithWorkitemId(I, ST.getWavefrontSize()); +} + +/// Optimize mbcnt.hi calls for lane ID computation. +bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const { + // Abort if wave size is not known at compile time. + if (!ST.isWaveSizeKnown()) + return false; + + unsigned Wave = ST.getWavefrontSize(); + + // On wave32, the upper 32 bits of execution mask are always 0, so + // mbcnt.hi(mask, val) always returns val unchanged. + if (ST.isWave32()) { + if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) { + // Replace mbcnt.hi(mask, val) with val only when work group size matches + // wave size (single wave per work group). + if (*MaybeX == Wave) { + BasicBlock::iterator BI(&I); + ReplaceInstWithValue(BI, I.getArgOperand(1)); + return true; + } + } + } + + // Optimize the complete lane ID computation pattern: + // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs + // across the full execution mask. 
+ using namespace PatternMatch; + + // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0)) + if (!match(&I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>( + m_AllOnes(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>( + m_AllOnes(), m_Zero())))) + return false; + + return tryReplaceWithWorkitemId(I, Wave); +} + char AMDGPUCodeGenPrepare::ID = 0; FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e8b211f..7f00ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[ combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), + [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), + (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< + (defs root:$dst), + (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), + (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), + (G_OR $or, $x_lo, $y), + (G_MERGE_VALUES $dst, $or, $x_hi))>; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. 
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg, combine_shuffle_vector_to_build_vector, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } @@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index d23521c..77be58c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -143,8 +143,7 @@ static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, } static bool mayIgnoreSignedZero(MachineInstr &MI) { - const TargetOptions &Options = MI.getMF()->getTarget().Options; - return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz); + return MI.getFlag(MachineInstr::MIFlag::FmNsz); } static bool isInv2Pi(const APFloat &APF) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index d14b5ce..f538769 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", + "HasFP64", "true", "Enable double precision operations" >; def FeatureFMA : SubtargetFeature<"fmaf", - "FMA", + "HasFMA", "true", "Enable single precision FMA (not as 
fast as mul+add, but fused)" >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index bb4bf74..cfef046 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -288,6 +288,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; // FIXME: Check MMO is atomic def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>; def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>; +def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>; +def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>; def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>; def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>; @@ -308,6 +310,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32, SIbuffer_atomic_csub>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>; @@ -326,6 +329,12 @@ def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>; // G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return, // so we don't mark it as equivalent. 
+def : GINodeEquiv<G_AMDGPU_SPONENTRY, sponentry>; + +def : GINodeEquiv<G_AMDGPU_FLAT_LOAD_MONITOR, AMDGPUflat_load_monitor>; +def : GINodeEquiv<G_AMDGPU_GLOBAL_LOAD_MONITOR, AMDGPUglobal_load_monitor>; + + class GISelSop2Pat < SDPatternOperator node, Instruction inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 1b4b113..6bad4dbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -131,7 +131,7 @@ protected: public: MetadataStreamerMsgPackV4() = default; - ~MetadataStreamerMsgPackV4() = default; + ~MetadataStreamerMsgPackV4() override = default; bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; @@ -154,7 +154,7 @@ protected: public: MetadataStreamerMsgPackV5() = default; - ~MetadataStreamerMsgPackV5() = default; + ~MetadataStreamerMsgPackV5() override = default; }; class MetadataStreamerMsgPackV6 final : public MetadataStreamerMsgPackV5 { @@ -163,7 +163,7 @@ protected: public: MetadataStreamerMsgPackV6() = default; - ~MetadataStreamerMsgPackV6() = default; + ~MetadataStreamerMsgPackV6() override = default; void emitKernelAttrs(const AMDGPUTargetMachine &TM, const MachineFunction &MF, msgpack::MapDocNode Kern) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp new file mode 100644 index 0000000..37f8678 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp @@ -0,0 +1,77 @@ +//===--- AMDGPUHazardLatency.cpp - AMDGPU Hazard Latency Adjustment -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to adjust the +/// latency of data edges between instructions which use registers +/// potentially subject to additional hazard waits not accounted +/// for in the normal scheduling model. +/// While the scheduling model is typically still accurate in these +/// scenarios, adjusting latency of relevant edges can improve wait +/// merging and reduce pipeline impact of any required waits. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUHazardLatency.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" + +using namespace llvm; + +namespace { + +class HazardLatency : public ScheduleDAGMutation { +private: + const GCNSubtarget &ST; + const SIRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + +public: + HazardLatency(MachineFunction *MF) + : ST(MF->getSubtarget<GCNSubtarget>()), TRI(*ST.getRegisterInfo()), + MRI(MF->getRegInfo()) {} + void apply(ScheduleDAGInstrs *DAG) override; +}; + +void HazardLatency::apply(ScheduleDAGInstrs *DAG) { + constexpr unsigned MaskLatencyBoost = 3; + + // Hazard only manifests in Wave64 + if (!ST.hasVALUMaskWriteHazard() || !ST.isWave64()) + return; + + for (SUnit &SU : DAG->SUnits) { + const MachineInstr *MI = SU.getInstr(); + if (!SIInstrInfo::isVALU(*MI)) + continue; + if (MI->getOpcode() == AMDGPU::V_READLANE_B32 || + MI->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) + continue; + for (SDep &SuccDep : SU.Succs) { + if (SuccDep.isCtrl()) + continue; + // Boost latency on VALU writes to SGPRs used by VALUs. + // Reduce risk of premature VALU pipeline stall on associated reads. 
+ MachineInstr *DestMI = SuccDep.getSUnit()->getInstr(); + if (!SIInstrInfo::isVALU(*DestMI)) + continue; + Register Reg = SuccDep.getReg(); + if (!TRI.isSGPRReg(MRI, Reg)) + continue; + SuccDep.setLatency(SuccDep.getLatency() * MaskLatencyBoost); + } + } +} + +} // end namespace + +std::unique_ptr<ScheduleDAGMutation> +llvm::createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF) { + return std::make_unique<HazardLatency>(MF); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h new file mode 100644 index 0000000..134cc27 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h @@ -0,0 +1,24 @@ +//===- AMDGPUHazardLatency.h - Hazard Latency Adjustment --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +class MachineFunction; + +std::unique_ptr<ScheduleDAGMutation> +createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 5700468..10ffbe2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -24,6 +24,8 @@ #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include <type_traits> + using namespace llvm; #define DEBUG_TYPE "igrouplp" @@ -1044,7 +1046,7 @@ private: if (!SyncPipe.size()) return false; - auto SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { + unsigned 
SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); if (SuccSize >= Size) @@ -1052,7 +1054,7 @@ private: if (HasIntermediary) { for (auto Succ : SU->Succs) { - auto SuccSize = + unsigned SuccSize = llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) { return SuccSucc.getKind() == SDep::Data; }); @@ -1084,7 +1086,7 @@ private: if (!SyncPipe.size()) return false; - auto SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { + unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); if (SuccSize >= Size) @@ -1092,7 +1094,7 @@ private: if (HasIntermediary) { for (auto Succ : SU->Succs) { - auto SuccSize = + unsigned SuccSize = llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) { return SuccSucc.getKind() == SDep::Data; }); @@ -1968,7 +1970,7 @@ private: int NumBits = 0; auto TRI = TII->getRegisterInfo(); - auto &MRI = MI->getParent()->getParent()->getRegInfo(); + auto &MRI = MI->getMF()->getRegInfo(); for (auto &Elt : Collection) { auto Op = Elt->getInstr()->getOperand(0); auto Size = @@ -2183,7 +2185,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); // Interleave MFMA with DS_READ prefetch - for (unsigned I = 0; I < DSRCount - 4; ++I) { + for (unsigned I = 4; I < DSRCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); @@ -2196,7 +2198,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // Phase 2a: Loop carried dependency with V_PERM // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they // depend on. Interleave MFMA to keep XDL unit busy throughout. 
- for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { + for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); @@ -2233,7 +2235,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( // Phase 2b: Loop carried dependency without V_PERM // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. // Interleave MFMA to keep XDL unit busy throughout. - for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { + for (unsigned I = DSWWithPermCount; I < DSWCount; I++) { SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); @@ -2391,6 +2393,61 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const { if (MI.isMetaInstruction()) Result = false; + else if (MI.isInlineAsm()) { + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + auto &MRI = MI.getParent()->getParent()->getRegInfo(); + bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false, + VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(), + MayStore = MI.mayStore(); + for (const MachineOperand &Operand : MI.operands()) + if (Operand.isReg()) { + const TargetRegisterClass &RegClass = + *TRI.getRegClassForOperandReg(MRI, Operand); + if (TRI.hasVGPRs(&RegClass)) { + VGPR_used = true; + if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32) + VReg32_used = true; + } + // > 128 bit registers are usually only used by MFMA instructions, so + // we're using that as a heuristic to guess the schedule group mask of + // the inline asm. 
+ if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128) + VMFMA_used = true; + if (TRI.hasSGPRs(&RegClass)) + SGPR_used = true; + if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef()) + SGPR_big_def = true; + } + + typedef std::underlying_type_t<SchedGroupMask> SGMask_t; + SGMask_t InlineAsmMask = 0; + if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore) + InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU; + if (SGPR_used && !VGPR_used && !MayLoad && !MayStore) + InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU; + if (VMFMA_used) + InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA; + if (VGPR_used && MayLoad) + InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ + : SchedGroupMask::VMEM_READ); + if (VGPR_used && MayStore) + InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE + : SchedGroupMask::VMEM_WRITE); + if (SGPR_big_def) + InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ; + if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU || + InlineAsmMask & (SGMask_t)SchedGroupMask::SALU) + InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU; + if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ || + InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE) + InlineAsmMask |= (SGMask_t)SchedGroupMask::DS; + if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ || + InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE) + InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM; + + Result = ((SGMask_t)SGMask & InlineAsmMask) != 0; + } + else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || TII->isTRANS(MI))) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index aff7096..0688f07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,7 +11,6 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> -#include <vector> namespace llvm { diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d..238f06f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -134,7 +134,7 @@ static SDValue stripExtractLoElt(SDValue In) { INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) -INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfoWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) #ifdef EXPENSIVE_CHECKS @@ -238,7 +238,7 @@ bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) { } void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AMDGPUArgumentUsageInfo>(); + AU.addRequired<AMDGPUArgumentUsageInfoWrapperLegacy>(); AU.addRequired<UniformityInfoWrapperPass>(); #ifdef EXPENSIVE_CHECKS AU.addRequired<DominatorTreeWrapperPass>(); @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -708,10 +726,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); assert(VT.getVectorElementType().bitsEq(MVT::i32)); - unsigned RegClassID = - SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID(); - SelectBuildVector(N, RegClassID); + const TargetRegisterClass *RegClass = + N->isDivergent() + ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32) + : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32); + + SelectBuildVector(N, RegClass->getID()); return; } case ISD::VECTOR_SHUFFLE: @@ -1284,7 +1306,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // FIXME: Select to VOP3 version for with-carry. 
unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { + if (Subtarget->hasAddNoCarryInsts()) { SubOp = AMDGPU::V_SUB_U32_e64; Opnds.push_back( CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit @@ -1469,7 +1491,7 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, Opnds.push_back(Zero); Opnds.push_back(Addr.getOperand(1)); unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { + if (Subtarget->hasAddNoCarryInsts()) { SubOp = AMDGPU::V_SUB_U32_e64; Opnds.push_back( CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit @@ -1828,72 +1850,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. 
- - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector<SDValue, 3> Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? 
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. 
+ + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarryInsts()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? + SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } @@ -3047,9 +3080,38 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); const unsigned Opc = gwsIntrinToOpcode(IntrID); 
+ + const MCInstrDesc &InstrDesc = TII->get(Opc); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + + const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx); + SmallVector<SDValue, 5> Ops; - if (HasVSrc) - Ops.push_back(N->getOperand(2)); + if (HasVSrc) { + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + SDValue Data = N->getOperand(2); + MVT DataVT = Data.getValueType().getSimpleVT(); + if (TRI->isTypeLegalForClass(*DataRC, DataVT)) { + // Normal 32-bit case. + Ops.push_back(N->getOperand(2)); + } else { + // Operand is really 32-bits, but requires 64-bit alignment, so use the + // even aligned 64-bit register class. + const SDValue RegSeqOps[] = { + CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data, + CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32), + 0), + CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)}; + + Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + SL, MVT::v2i32, RegSeqOps), + 0)); + } + } + Ops.push_back(OffsetField); Ops.push_back(Chain); @@ -4222,7 +4284,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In, SmallVector<SDValue, 3> Backup(Src.begin(), Src.end()); if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) { - Src = Backup; + Src = std::move(Backup); return std::make_pair(0, 0); } @@ -4387,16 +4449,23 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { const auto *Ld = cast<LoadSDNode>(N); - const MachineMemOperand *MMO = Ld->getMemOperand(); - if (N->isDivergent() && !AMDGPU::isUniformMMO(MMO)) + + // FIXME: We ought to able able to take the direct isDivergent result. We + // cannot rely on the MMO for a uniformity check, and should stop using + // it. 
This is a hack for 2 ways that the IR divergence analysis is superior + // to the DAG divergence: Recognizing shift-of-workitem-id as always + // uniform, and isSingleLaneExecution. These should be handled in the DAG + // version, and then this can be dropped. + if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO)) return false; return MMO->getSize().hasValue() && Ld->getAlign() >= Align(std::min(MMO->getSize().getValue().getKnownMinValue(), uint64_t(4))) && - ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + (MMO->isInvariant() || + (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 4fa0d3f..a86b754 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H +#include "AMDGPUSelectionDAGInfo.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIModeRegisterDefaults.h" @@ -45,21 +46,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { return false; } -// TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - SDLoc SL(N); - uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); - return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), - DAG.getTargetConstant(K, SL, MVT::i32)); - } - - return nullptr; -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. 
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -115,6 +101,8 @@ private: MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; SDNode *glueCopyToM0LDSInit(SDNode *N) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1b559a6..da21033 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUMemoryUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" @@ -59,8 +60,9 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, - const AMDGPUSubtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { + const TargetSubtargetInfo &STI, + const AMDGPUSubtarget &AMDGPUSTI) + : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) { // Always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather then generating calls to memset, mempcy or memmove. 
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U; @@ -336,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand); @@ -424,22 +427,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, Expand); setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand); - - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); - } else { - setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); - setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); - } + setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16, Custom); setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal); - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal); - } // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches // scalarization code. Can be removed when IS_FPCLASS expand isn't called by @@ -451,11 +445,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v16f64}, Custom); - if (isTypeLegal(MVT::f16)) - setOperationAction(ISD::IS_FPCLASS, - {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16}, - Custom); - // Expand to fneg + fadd. 
setOperationAction(ISD::FSUB, MVT::f64, Expand); @@ -478,7 +467,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, Custom); - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64, + Expand); setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; @@ -502,16 +492,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // The hardware supports 32-bit FSHR, but not FSHL. setOperationAction(ISD::FSHR, MVT::i32, Legal); - // The hardware supports 32-bit ROTR, but not ROTL. - setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand); setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); - setOperationAction( - {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, - MVT::i64, Custom); + setOperationAction({ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, + ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT}, + MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, @@ -530,19 +519,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. 
- setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT, - ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, - ISD::MULHS, ISD::OR, ISD::SHL, - ISD::SRA, ISD::SRL, ISD::ROTL, - ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, - ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, - ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, - ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, - ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC, - ISD::XOR, ISD::BSWAP, ISD::CTPOP, - ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE, - ISD::SETCC, ISD::ADDRSPACECAST}, + // clang-format off + setOperationAction({ISD::ADD, ISD::AND, + ISD::FP_TO_SINT, ISD::FP_TO_UINT, + ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, + ISD::MUL, ISD::MULHU, + ISD::MULHS, ISD::OR, + ISD::SHL, ISD::SRA, + ISD::SRL, ISD::ROTL, + ISD::ROTR, ISD::SUB, + ISD::SINT_TO_FP, ISD::UINT_TO_FP, + ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM, + ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::SDIVREM, ISD::UDIVREM, + ISD::SELECT, ISD::VSELECT, + ISD::SELECT_CC, ISD::XOR, + ISD::BSWAP, ISD::CTPOP, + ISD::CTTZ, ISD::CTLZ, + ISD::VECTOR_SHUFFLE, ISD::SETCC, + ISD::ADDRSPACECAST}, VT, Expand); + // clang-format on } static const MVT::SimpleValueType FloatVectorTypes[] = { @@ -643,9 +641,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { - if (getTargetMachine().Options.NoSignedZerosFPMath) - return true; - const auto Flags = Op.getNode()->getFlags(); if (Flags.hasNoSignedZeros()) return true; @@ -820,9 +815,7 @@ bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { // FIXME: Why are we reporting vectors of FP immediates as legal? bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || - (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); + return isTypeLegal(VT.getScalarType()); } // We don't want to shrink f64 / f32 constants. 
@@ -966,8 +959,8 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); // Packed operations do not have a fabs modifier. - return VT == MVT::f32 || VT == MVT::f64 || - (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16)); + // Report this based on the end legalized type. + return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16; } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { @@ -1056,8 +1049,9 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: - if (Subtarget->has16BitInsts() && - (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) { + if (isTypeLegal(MVT::i16) && + (!DestVT.isVector() || + !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P // Don't narrow back down to i16 if promoted to i32 already. if (!N->isDivergent() && DestVT.isInteger() && DestVT.getScalarSizeInBits() > 1 && @@ -1216,9 +1210,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const SmallVectorImpl<ISD::InputArg> &Ins) const { const MachineFunction &MF = State.getMachineFunction(); const Function &Fn = MF.getFunction(); - LLVMContext &Ctx = Fn.getParent()->getContext(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); - const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(); + LLVMContext &Ctx = Fn.getContext(); + const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(); CallingConv::ID CC = Fn.getCallingConv(); Align MaxAlign = Align(1); @@ -1248,7 +1241,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( SmallVector<EVT, 16> ValueVTs; SmallVector<uint64_t, 16> Offsets; - ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr, + &Offsets, ArgOffset); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { @@ -1409,7 +1403,12 @@ SDValue 
AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, InVals.push_back(DAG.getPOISON(Arg.VT)); } - return DAG.getEntryNode(); + // FIXME: Hack because R600 doesn't handle callseq pseudos yet. + if (getTargetMachine().getTargetTriple().getArch() == Triple::r600) + return CLI.Chain; + + SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL); + return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL); } SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, @@ -1465,6 +1464,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: + return LowerFP_TO_INT_SAT(Op, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: @@ -1528,7 +1530,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, if (std::optional<uint32_t> Address = AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { if (IsNamedBarrier) { - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16; MFI->recordNumNamedBarriers(Address.value(), BarCnt); } return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); @@ -1885,14 +1887,14 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, Align BaseAlign = Load->getAlign(); Align HiAlign = commonAlignment(BaseAlign, Size); - SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, SrcValue, LoMemVT, - BaseAlign, Load->getMemOperand()->getFlags()); + SDValue LoLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, + LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo()); SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size)); - SDValue HiLoad = - DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), - HiPtr, 
SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); + SDValue HiLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, + Load->getMemOperand()->getFlags(), Load->getAAInfo()); SDValue Join; if (LoVT == HiVT) { @@ -1980,10 +1982,10 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, - Store->getMemOperand()->getFlags()); - SDValue HiStore = - DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), - HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); + Store->getMemOperand()->getFlags(), Store->getAAInfo()); + SDValue HiStore = DAG.getTruncStore( + Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign, + Store->getMemOperand()->getFlags(), Store->getAAInfo()); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -2628,11 +2630,18 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) { return Src.getOperand(0).getValueType() == MVT::f16; case ISD::FP16_TO_FP: case ISD::FFREXP: + case ISD::FSQRT: + case AMDGPUISD::LOG: + case AMDGPUISD::EXP: return true; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = Src.getConstantOperandVal(0); switch (IntrinsicID) { case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_exp2: + case Intrinsic::amdgcn_sqrt: return true; default: return false; @@ -2731,7 +2740,7 @@ SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f16) { // Nothing in half is a denormal when promoted to f32. 
- assert(!Subtarget->has16BitInsts()); + assert(!isTypeLegal(VT)); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, @@ -2764,20 +2773,18 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, EVT VT = Op.getValueType(); SDNodeFlags Flags = Op->getFlags(); SDLoc DL(Op); - const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; assert(IsLog10 || Op.getOpcode() == ISD::FLOG); - const auto &Options = getTargetMachine().Options; if (VT == MVT::f16 || Flags.hasApproximateFuncs()) { - if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { + if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) { // Log and multiply in f32 is good enough for f16. X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); } SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags); - if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { + if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) { return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, DAG.getTargetConstant(0, DL, MVT::i32), Flags); } @@ -2803,7 +2810,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); - + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. 
+ Flags.setAllowContract(false); R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); @@ -2826,15 +2835,16 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); - + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + Flags.setAllowContract(false); SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); R = getMad(DAG, DL, VT, YH, CH, Mad1); } - const bool IsFiniteOnly = - (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs(); + const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs(); // TODO: Check if known finite from source value. if (!IsFiniteOnly) { @@ -2910,7 +2920,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f16) { // Nothing in half is a denormal when promoted to f32. 
- assert(!Subtarget->has16BitInsts()); + assert(!isTypeLegal(MVT::f16)); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, @@ -2950,19 +2960,28 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); } +SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL, + SelectionDAG &DAG, + SDNodeFlags Flags, + bool IsExp10) const { + // exp(x) -> exp2(M_LOG2E_F * x); + // exp10(x) -> exp2(log2(10) * x); + EVT VT = X.getValueType(); + SDValue Const = + DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags); + return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP + : (unsigned)ISD::FEXP2, + SL, VT, Mul, Flags); +} + SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { EVT VT = X.getValueType(); - const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); - - if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { - // exp2(M_LOG2E_F * f); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); - return DAG.getNode(VT == MVT::f32 ? 
(unsigned)AMDGPUISD::EXP - : (unsigned)ISD::FEXP2, - SL, VT, Mul, Flags); - } + if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) + return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false); EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); @@ -2976,6 +2995,7 @@ SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SDValue AdjustedX = DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); + const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); @@ -2994,6 +3014,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { const EVT VT = X.getValueType(); + const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP) : static_cast<unsigned>(ISD::FEXP2); @@ -3050,33 +3071,32 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { SDNodeFlags Flags = Op->getFlags(); const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; - if (VT.getScalarType() == MVT::f16) { - // v_exp_f16 (fmul x, log2e) - if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? - return lowerFEXPUnsafe(X, SL, DAG, Flags); + // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying + // library behavior. Also, is known-not-daz source sufficient? + if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast? + return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) + : lowerFEXPUnsafe(X, SL, DAG, Flags); + } + if (VT.getScalarType() == MVT::f16) { if (VT.isVector()) return SDValue(); + // Nothing in half is a denormal when promoted to f32. + // // exp(f16 x) -> // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) - - // Nothing in half is a denormal when promoted to f32. 
+ // + // exp10(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2(10))) SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); - SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); + SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10); return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, DAG.getTargetConstant(0, SL, MVT::i32), Flags); } assert(VT == MVT::f32); - // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying - // library behavior. Also, is known-not-daz source sufficient? - if (allowApproxFunc(DAG, Flags)) { - return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) - : lowerFEXPUnsafe(X, SL, DAG, Flags); - } - // Algorithm: // // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) @@ -3369,8 +3389,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, // Get the 32-bit normalized integer. Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust); // Convert the normalized 32-bit integer into f32. - unsigned Opc = - (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + + bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32); + unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm); // Finally, need to scale back the converted floating number as the original @@ -3378,7 +3399,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), ShAmt); // On GCN, use LDEXP directly. 
- if (Subtarget->isGCN()) + if (UseLDEXP) return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent @@ -3445,7 +3466,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, if (SrcVT != MVT::i64) return Op; - if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) { SDLoc DL(Op); SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); @@ -3493,7 +3514,7 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, // TODO: Factor out code common with LowerUINT_TO_FP. - if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) { SDLoc DL(Op); SDValue Src = Op.getOperand(0); @@ -3737,6 +3758,86 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + unsigned OpOpcode = Op.getOpcode(); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Op.getValueType(); + SDValue SatVTOp = Op.getNode()->getOperand(1); + EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT(); + SDLoc DL(Op); + + uint64_t DstWidth = DstVT.getScalarSizeInBits(); + uint64_t SatWidth = SatVT.getScalarSizeInBits(); + assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width"); + + // Will be selected natively + if (DstVT == MVT::i32 && SatWidth == DstWidth && + (SrcVT == MVT::f32 || SrcVT == MVT::f64)) + return Op; + + const SDValue Int32VT = DAG.getValueType(MVT::i32); + + // Perform all saturation at i32 and truncate + if (SatWidth < DstWidth) { + const uint64_t Int32Width = 32; + SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, Int32VT); + SDValue Int32SatVal; + + if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { + SDValue MinConst = DAG.getConstant( + APInt::getSignedMaxValue(SatWidth).sext(Int32Width), DL, MVT::i32); + SDValue MaxConst 
= DAG.getConstant( + APInt::getSignedMinValue(SatWidth).sext(Int32Width), DL, MVT::i32); + SDValue MinVal = + DAG.getNode(ISD::SMIN, DL, MVT::i32, FpToInt32, MinConst); + Int32SatVal = DAG.getNode(ISD::SMAX, DL, MVT::i32, MinVal, MaxConst); + } else { + SDValue MinConst = DAG.getConstant( + APInt::getMaxValue(SatWidth).zext(Int32Width), DL, MVT::i32); + Int32SatVal = DAG.getNode(ISD::UMIN, DL, MVT::i32, FpToInt32, MinConst); + } + + if (DstWidth == Int32Width) + return Int32SatVal; + if (DstWidth < Int32Width) + return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Int32SatVal); + + // DstWidth > Int32Width + const unsigned Ext = + OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(Ext, DL, DstVT, FpToInt32); + } + + // SatWidth == DstWidth + + // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below) + if (DstVT == MVT::i64 && + (SrcVT == MVT::f16 || SrcVT == MVT::bf16 || + (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) { + return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VT); + } + + // Promote f16/bf16 src to f32 + if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) { + SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp); + } + + // Promote sub-i32 dst to i32 with sub-i32 saturation + if (DstWidth < 32) { + // Note: this triggers SatWidth < DstWidth above to generate saturated + // truncate by requesting MVT::i32 destination with SatWidth < 32. + SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src, SatVTOp); + return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt32); + } + + // TODO: can we implement i64 dst for f32/f64? 
+ + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); @@ -4125,8 +4226,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, EVT ElementType = VT.getScalarType(); EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); - EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) - : TargetScalarType; + EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType); if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) return SDValue(); @@ -4190,8 +4290,7 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, EVT ElementType = VT.getScalarType(); EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); - EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) - : TargetScalarType; + EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType); if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) return SDValue(); @@ -4312,8 +4411,7 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, EVT ElementType = VT.getScalarType(); EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext()); - EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType) - : TargetScalarType; + EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType); if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits()) return SDValue(); @@ -4547,7 +4645,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, } // There are i16 integer mul/mad. 
- if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) + if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16)) return SDValue(); // SimplifyDemandedBits has the annoying habit of turning useful zero_extends @@ -4666,7 +4764,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) + if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24()) return SDValue(); // Don't generate 24-bit multiplies on values that are in SGPRs, since @@ -4675,7 +4773,7 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, // value is in an SGPR. // This doesn't apply if no s_mul_hi is available (since we'll end up with a // valu op anyway) - if (Subtarget->hasSMulHi() && !N->isDivergent()) + if (!N->isDivergent() && Subtarget->hasSMulHi()) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -4700,9 +4798,7 @@ SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, const SDLoc &DL, unsigned Opc) const { EVT VT = Op.getValueType(); - EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); - if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && - LegalVT != MVT::i16)) + if (VT.bitsGT(MVT::i32)) return SDValue(); if (VT != MVT::i32) @@ -4999,7 +5095,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDLoc SL(N); switch (Opc) { case ISD::FADD: { - if (!mayIgnoreSignedZero(N0)) + if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros()) return SDValue(); // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) @@ -5047,7 +5143,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FMA: case ISD::FMAD: { // TODO: handle llvm.amdgcn.fma.legacy - if (!mayIgnoreSignedZero(N0)) + if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros()) return SDValue(); // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) @@ -5259,7 +5355,7 @@ SDValue 
AMDGPUTargetLowering::performFAbsCombine(SDNode *N, switch (N0.getOpcode()) { case ISD::FP16_TO_FP: { - assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); + assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal"); SDLoc SL(N); SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); @@ -5459,7 +5555,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, } if ((OffsetVal + WidthVal) >= 32 && - !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { + !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); @@ -5649,169 +5745,6 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); } -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(IF) - NODE_NAME_CASE(ELSE) - NODE_NAME_CASE(LOOP) - NODE_NAME_CASE(CALL) - NODE_NAME_CASE(TC_RETURN) - NODE_NAME_CASE(TC_RETURN_GFX) - NODE_NAME_CASE(TC_RETURN_GFX_WholeWave) - NODE_NAME_CASE(TC_RETURN_CHAIN) - NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) - NODE_NAME_CASE(TRAP) - NODE_NAME_CASE(RET_GLUE) - NODE_NAME_CASE(WAVE_ADDRESS) - NODE_NAME_CASE(RETURN_TO_EPILOG) - NODE_NAME_CASE(ENDPGM) - NODE_NAME_CASE(ENDPGM_TRAP) - NODE_NAME_CASE(SIMULATED_TRAP) - NODE_NAME_CASE(DWORDADDR) - NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(SETCC) - NODE_NAME_CASE(DENORM_MODE) - NODE_NAME_CASE(FMA_W_CHAIN) - NODE_NAME_CASE(FMUL_W_CHAIN) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - 
NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(FMED3) - NODE_NAME_CASE(SMED3) - NODE_NAME_CASE(UMED3) - NODE_NAME_CASE(FMAXIMUM3) - NODE_NAME_CASE(FMINIMUM3) - NODE_NAME_CASE(FDOT2) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(FMAD_FTZ) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RCP_LEGACY) - NODE_NAME_CASE(RCP_IFLAG) - NODE_NAME_CASE(LOG) - NODE_NAME_CASE(EXP) - NODE_NAME_CASE(FMUL_LEGACY) - NODE_NAME_CASE(RSQ_CLAMP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(FFBH_U32) - NODE_NAME_CASE(FFBH_I32) - NODE_NAME_CASE(FFBL_B32) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MULHI_U24) - NODE_NAME_CASE(MULHI_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(MAD_I64_I32) - NODE_NAME_CASE(MAD_U64_U32) - NODE_NAME_CASE(PERM) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(R600_EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(CVT_PKRTZ_F16_F32) - NODE_NAME_CASE(CVT_PKNORM_I16_F32) - NODE_NAME_CASE(CVT_PKNORM_U16_F32) - NODE_NAME_CASE(CVT_PK_I16_I32) - NODE_NAME_CASE(CVT_PK_U16_U32) - NODE_NAME_CASE(FP_TO_FP16) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - NODE_NAME_CASE(PC_ADD_REL_OFFSET) - NODE_NAME_CASE(PC_ADD_REL_OFFSET64) - NODE_NAME_CASE(LDS) - NODE_NAME_CASE(DUMMY_CHAIN) - NODE_NAME_CASE(LOAD_D16_HI) - NODE_NAME_CASE(LOAD_D16_LO) - NODE_NAME_CASE(LOAD_D16_HI_I8) - NODE_NAME_CASE(LOAD_D16_HI_U8) - NODE_NAME_CASE(LOAD_D16_LO_I8) - 
NODE_NAME_CASE(LOAD_D16_LO_U8) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(DS_ORDERED_COUNT) - NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(BUFFER_LOAD) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT) - NODE_NAME_CASE(BUFFER_LOAD_BYTE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT) - NODE_NAME_CASE(BUFFER_LOAD_TFE) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(SBUFFER_LOAD) - NODE_NAME_CASE(SBUFFER_LOAD_BYTE) - NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) - NODE_NAME_CASE(SBUFFER_LOAD_SHORT) - NODE_NAME_CASE(SBUFFER_LOAD_USHORT) - NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) - NODE_NAME_CASE(BUFFER_STORE) - NODE_NAME_CASE(BUFFER_STORE_BYTE) - NODE_NAME_CASE(BUFFER_STORE_SHORT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_ADD) - NODE_NAME_CASE(BUFFER_ATOMIC_SUB) - NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_AND) - NODE_NAME_CASE(BUFFER_ATOMIC_OR) - NODE_NAME_CASE(BUFFER_ATOMIC_XOR) - NODE_NAME_CASE(BUFFER_ATOMIC_INC) - NODE_NAME_CASE(BUFFER_ATOMIC_DEC) - NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) - NODE_NAME_CASE(BUFFER_ATOMIC_FADD) - NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) - NODE_NAME_CASE(WHOLE_WAVE_SETUP) - NODE_NAME_CASE(WHOLE_WAVE_RETURN) - } - return nullptr; -} - SDValue 
AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bdaf486..adbc2c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -51,7 +51,6 @@ protected: /// Split a vector store into multiple scalar stores. /// \returns The resulting chain. - SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; @@ -78,6 +77,9 @@ protected: bool IsLog10, SDNodeFlags Flags) const; SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, + SDNodeFlags Flags, bool IsExp10) const; + SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const; SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, @@ -96,6 +98,7 @@ protected: SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const; @@ -180,7 +183,8 @@ protected: const SmallVectorImpl<ISD::InputArg> &Ins) const; public: - AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); + AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, + const AMDGPUSubtarget &AMDGPUSTI); bool mayIgnoreSignedZero(SDValue Op) const; @@ -280,8 +284,6 @@ public: SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - const char* getTargetNodeName(unsigned Opcode) const override; - // 
FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for // AMDGPU. Commit r319036, // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6) @@ -406,235 +408,6 @@ public: } }; -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - BRANCH_COND, - // End AMDIL ISD Opcodes - - // Function call. - CALL, - TC_RETURN, - TC_RETURN_GFX, - TC_RETURN_GFX_WholeWave, - TC_RETURN_CHAIN, - TC_RETURN_CHAIN_DVGPR, - TRAP, - - // Masked control flow nodes. - IF, - ELSE, - LOOP, - - // A uniform kernel return that terminates the wavefront. - ENDPGM, - - // s_endpgm, but we may want to insert it in the middle of the block. - ENDPGM_TRAP, - - // "s_trap 2" equivalent on hardware that does not support it. - SIMULATED_TRAP, - - // Return to a shader part's epilog code. - RETURN_TO_EPILOG, - - // Return with values from a non-entry function. - RET_GLUE, - - // Convert a unswizzled wave uniform stack address to an address compatible - // with a vector offset for use in stack access. - WAVE_ADDRESS, - - DWORDADDR, - FRACT, - - /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output - /// modifier behavior with dx10_enable. - CLAMP, - - // This is SETCC with the full mask result which is used for a compare with a - // result bit per item in the wavefront. - SETCC, - - DENORM_MODE, - - // FP ops with input and output chain. - FMA_W_CHAIN, - FMUL_W_CHAIN, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - FMED3, - SMED3, - UMED3, - FMAXIMUM3, - FMINIMUM3, - FDOT2, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is - // treated as an illegal operation. 
- FMAD_FTZ, - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RCP_LEGACY, - RCP_IFLAG, - - // log2, no denormal handling for f32. - LOG, - - // exp2, no denormal handling for f32. - EXP, - - FMUL_LEGACY, - RSQ_CLAMP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - FFBH_U32, // ctlz with -1 if input is zero. - FFBH_I32, - FFBL_B32, // cttz with -1 if input is zero. - MUL_U24, - MUL_I24, - MULHI_U24, - MULHI_I24, - MAD_U24, - MAD_I24, - MAD_U64_U32, - MAD_I64_I32, - PERM, - TEXTURE_FETCH, - R600_EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - - // Convert two float 32 numbers into a single register holding two packed f16 - // with round to zero. - CVT_PKRTZ_F16_F32, - CVT_PKNORM_I16_F32, - CVT_PKNORM_U16_F32, - CVT_PK_I16_I32, - CVT_PK_U16_U32, - - // Same as the standard node, except the high bits of the resulting integer - // are known 0. - FP_TO_FP16, - - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. - /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. 
- CONST_DATA_PTR, - PC_ADD_REL_OFFSET, - PC_ADD_REL_OFFSET64, - LDS, - - DUMMY_CHAIN, - - FIRST_MEMORY_OPCODE, - LOAD_D16_HI = FIRST_MEMORY_OPCODE, - LOAD_D16_LO, - LOAD_D16_HI_I8, - LOAD_D16_HI_U8, - LOAD_D16_LO_I8, - LOAD_D16_LO_U8, - - STORE_MSKOR, - TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_D16, - TBUFFER_LOAD_FORMAT, - TBUFFER_LOAD_FORMAT_D16, - DS_ORDERED_COUNT, - ATOMIC_CMP_SWAP, - BUFFER_LOAD, - BUFFER_LOAD_UBYTE, - BUFFER_LOAD_USHORT, - BUFFER_LOAD_BYTE, - BUFFER_LOAD_SHORT, - BUFFER_LOAD_TFE, - BUFFER_LOAD_UBYTE_TFE, - BUFFER_LOAD_USHORT_TFE, - BUFFER_LOAD_BYTE_TFE, - BUFFER_LOAD_SHORT_TFE, - BUFFER_LOAD_FORMAT, - BUFFER_LOAD_FORMAT_TFE, - BUFFER_LOAD_FORMAT_D16, - SBUFFER_LOAD, - SBUFFER_LOAD_BYTE, - SBUFFER_LOAD_UBYTE, - SBUFFER_LOAD_SHORT, - SBUFFER_LOAD_USHORT, - SBUFFER_PREFETCH_DATA, - BUFFER_STORE, - BUFFER_STORE_BYTE, - BUFFER_STORE_SHORT, - BUFFER_STORE_FORMAT, - BUFFER_STORE_FORMAT_D16, - BUFFER_ATOMIC_SWAP, - BUFFER_ATOMIC_ADD, - BUFFER_ATOMIC_SUB, - BUFFER_ATOMIC_SMIN, - BUFFER_ATOMIC_UMIN, - BUFFER_ATOMIC_SMAX, - BUFFER_ATOMIC_UMAX, - BUFFER_ATOMIC_AND, - BUFFER_ATOMIC_OR, - BUFFER_ATOMIC_XOR, - BUFFER_ATOMIC_INC, - BUFFER_ATOMIC_DEC, - BUFFER_ATOMIC_CMPSWAP, - BUFFER_ATOMIC_CSUB, - BUFFER_ATOMIC_FADD, - BUFFER_ATOMIC_FMIN, - BUFFER_ATOMIC_FMAX, - BUFFER_ATOMIC_COND_SUB_U32, - LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32, - - // Set up a whole wave function. - WHOLE_WAVE_SETUP, - - // Return from a whole wave function. - WHOLE_WAVE_RETURN, -}; - -} // End namespace AMDGPUISD - } // End namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 9a90787..d1b9fb4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -68,10 +68,12 @@ public: // Get the delay type for a MachineInstr. 
DelayType getDelayType(const MachineInstr &MI) { - if (SIInstrInfo::isTRANS(MI)) + // Non-F64 TRANS instructions use a separate delay type. + if (SIInstrInfo::isTRANS(MI) && + !AMDGPU::isDPMACCInstruction(MI.getOpcode())) return TRANS; // WMMA XDL ops are treated the same as TRANS. - if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI)) return TRANS; if (SIInstrInfo::isVALU(MI)) return VALU; @@ -221,7 +223,7 @@ public: }; // A map from regunits to the delay info for that regunit. - struct DelayState : DenseMap<unsigned, DelayInfo> { + struct DelayState : DenseMap<MCRegUnit, DelayInfo> { // Merge another DelayState into this one by merging the delay info for each // regunit. void merge(const DelayState &RHS) { @@ -325,6 +327,13 @@ public: for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), E = MachineBasicBlock::instr_iterator(MI); ++I != E;) { + if (I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { + // It is not deterministic whether the skip count counts + // S_SET_VGPR_MSB instructions or not, so do not include them in a + // skip region. + Skip = 6; + break; + } if (!I->isBundle() && !I->isMetaInstruction()) ++Skip; } @@ -359,7 +368,8 @@ public: bool Changed = false; MachineInstr *LastDelayAlu = nullptr; - MCRegUnit LastSGPRFromVALU = 0; + // FIXME: 0 is a valid register unit. + MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0); // Iterate over the contents of bundles, but don't emit any instructions // inside a bundle. for (auto &MI : MBB.instrs()) { @@ -379,7 +389,8 @@ public: if (It != State.end()) { DelayInfo Info = It->getSecond(); State.advanceByVALUNum(Info.VALUNum); - LastSGPRFromVALU = 0; + // FIXME: 0 is a valid register unit. 
+ LastSGPRFromVALU = static_cast<MCRegUnit>(0); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4fe5d00..376184e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -35,7 +35,7 @@ struct AMDGPUImageDMaskIntrinsic { }; #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL -#include "InstCombineTables.inc" +#include "AMDGPUGenSearchableTables.inc" } // end anonymous namespace @@ -553,6 +553,89 @@ static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old, return NewCall; } +// Return true for sequences of instructions that effectively assign +// each lane to its thread ID +static bool isThreadID(const GCNSubtarget &ST, Value *V) { + // Case 1: + // wave32: mbcnt_lo(-1, 0) + // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0)) + auto W32Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_ConstantInt<-1>(), + m_ConstantInt<0>()); + auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>( + m_ConstantInt<-1>(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>( + m_ConstantInt<-1>(), m_ConstantInt<0>())); + if (ST.isWave32() && match(V, W32Pred)) + return true; + if (ST.isWave64() && match(V, W64Pred)) + return true; + + return false; +} + +// Attempt to capture situations where the index argument matches +// a DPP pattern, and convert to a DPP-based mov +static std::optional<Instruction *> +tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) { + Value *Val = II.getArgOperand(0); + Value *Idx = II.getArgOperand(1); + auto &B = IC.Builder; + + // DPP16 Row Share requires known wave size, architecture support + if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare()) + return std::nullopt; + + Value *Tid; + uint64_t Mask; + uint64_t RowIdx; + bool CanDPP16RowShare = false; + + // wave32 requires Mask & 0x1F == 0x10 + // wave64 requires Mask & 0x3F == 0x30 + uint64_t MaskCheck = (1UL << ST.getWavefrontSizeLog2()) - 1; + 
uint64_t MaskTarget = MaskCheck & 0xF0; + + // DPP16 Row Share 0: Idx = Tid & Mask + auto RowShare0Pred = m_And(m_Value(Tid), m_ConstantInt(Mask)); + + // DPP16 Row Share (0 < Row < 15): Idx = (Tid & Mask) | RowIdx + auto RowSharePred = + m_Or(m_And(m_Value(Tid), m_ConstantInt(Mask)), m_ConstantInt(RowIdx)); + + // DPP16 Row Share 15: Idx = Tid | 0xF + auto RowShare15Pred = m_Or(m_Value(Tid), m_ConstantInt<0xF>()); + + if (match(Idx, RowShare0Pred) && isThreadID(ST, Tid)) { + if ((Mask & MaskCheck) != MaskTarget) + return std::nullopt; + + RowIdx = 0; + CanDPP16RowShare = true; + } else if (match(Idx, RowSharePred) && isThreadID(ST, Tid) && RowIdx < 15 && + RowIdx > 0) { + if ((Mask & MaskCheck) != MaskTarget) + return std::nullopt; + + CanDPP16RowShare = true; + } else if (match(Idx, RowShare15Pred) && isThreadID(ST, Tid)) { + RowIdx = 15; + CanDPP16RowShare = true; + } + + if (CanDPP16RowShare) { + CallInst *UpdateDPP = + B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(), + {PoisonValue::get(Val->getType()), Val, + B.getInt32(AMDGPU::DPP::ROW_SHARE0 | RowIdx), + B.getInt32(0xF), B.getInt32(0xF), B.getFalse()}); + UpdateDPP->takeName(&II); + UpdateDPP->copyMetadata(II); + return IC.replaceInstUsesWith(II, UpdateDPP); + } + + // No valid DPP detected + return std::nullopt; +} + Instruction * GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const { @@ -788,7 +871,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) Exp = 0; - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp)); + return IC.replaceInstUsesWith(II, + ConstantInt::getSigned(II.getType(), Exp)); } if (isa<PoisonValue>(Src)) @@ -1458,30 +1542,30 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment)) return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); - if 
(isa<UndefValue>(Src)) { - auto *QNaN = ConstantFP::get( - II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics())); - return IC.replaceInstUsesWith(II, QNaN); - } + if (isa<UndefValue>(Segment)) + return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType())); - const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src); - if (!Csrc) + if (II.isStrictFP()) break; - if (II.isStrictFP()) + const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src); + if (!CSrc && !isa<UndefValue>(Src)) break; - const APFloat &Fsrc = Csrc->getValueAPF(); - if (Fsrc.isNaN()) { - auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet()); - return IC.replaceInstUsesWith(II, Quieted); - } + // The instruction ignores special cases, and literally just extracts the + // exponents. Fold undef to nan, and index the table as normal. + APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt() + : APFloat::getQNaN(II.getType()->getFltSemantics()) + .bitcastToAPInt(); const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment); - if (!Cseg) + if (!Cseg) { + if (isa<UndefValue>(Src)) + return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType())); break; + } - unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff; + unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52); unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue(); unsigned Shift = SegmentVal * 53; if (Exponent > 1077) @@ -1737,6 +1821,33 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { NewII->takeName(&II); return IC.replaceInstUsesWith(II, NewII); } + case Intrinsic::amdgcn_tensor_load_to_lds: + case Intrinsic::amdgcn_tensor_store_from_lds: { + Value *D2 = II.getArgOperand(2); + Value *D3 = II.getArgOperand(3); + // We know that not passing the second and third tensor DMA groups is + // equivalent to passing zeroes for those registers, so we rewrite to the + // shorter form here. Undef or poison are replaced by 0. 
+ auto Pred = m_CombineOr(m_Zero(), m_Undef()); + if (!match(D2, Pred) || !match(D3, Pred)) + return std::nullopt; + + auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds + ? Intrinsic::amdgcn_tensor_load_to_lds_d2 + : Intrinsic::amdgcn_tensor_store_from_lds_d2; + CallInst *NewII = IC.Builder.CreateIntrinsic( + ShortIntrinsic, + {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)}); + NewII->takeName(&II); + NewII->copyMetadata(II); + return IC.eraseInstFromFunction(II); + } + case Intrinsic::amdgcn_wave_shuffle: { + if (!ST->hasDPP()) + return std::nullopt; + + return tryWaveShuffleDPP(*ST, IC, II); + } } if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 5085e86..2b1f404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -29,11 +29,19 @@ Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) { // TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) { const Value *Ptr = MMO->getValue(); + if (!Ptr) { + if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) { + return PSV->isConstantPool() || PSV->isStack() || PSV->isGOT() || + PSV->isJumpTable(); + } + + // Unknown value. + return false; + } + // UndefValue means this is a load of a kernel input. These are uniform. // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. 
- if (!Ptr || isa<UndefValue, Constant, GlobalValue>(Ptr)) + if (isa<UndefValue, Constant, GlobalValue>(Ptr)) return true; if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b8fa6f3..8dc5d45 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -62,6 +62,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2, // AMDGPU DAG Nodes // +// Masked control flow nodes. def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; @@ -114,6 +115,7 @@ def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue] >; +// Pointer to the start of the shader's constant data. def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> @@ -122,18 +124,21 @@ def AMDGPUconstdata_ptr : SDNode< // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. +// Denormals handled on some parts. 
def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + // out = a - floor(a) def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; -// v_log_f32, which is log2 +// v_log_f32, which is log2, no denormal handling for f32. def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>; -// v_exp_f32, which is exp2 +// v_exp_f32, which is exp2, no denormal handling for f32. def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) @@ -146,11 +151,16 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +// Convert two float 32 numbers into a single register holding two packed f16 +// with round to zero. def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; + +// Same as the standard node, except the high bits of the resulting integer +// are known 0. def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; @@ -225,14 +235,18 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; +// This is SETCC with the full mask result which is used for a compare with a +// result bit per item in the wavefront. def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; +// FP ops with input and output chain. 
def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +// These cvt_f32_ubyte* nodes need to remain consecutive and in order. def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", @@ -264,6 +278,8 @@ def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, // Denominator, src2 = Numerator). def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is +// treated as an illegal operation. def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", @@ -290,14 +306,23 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Extract range of bits with zero extension to 32-bits. def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; + +// Extract range of bits with sign extension to 32-bits. def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; + +// (src0 & src1) | (~src0 & src2) def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; + +// Insert a range of bits into a 32-bit word. def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// ctlz with -1 if input is zero. def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>; def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; +// cttz with -1 if input is zero. def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; // Signed and unsigned 24-bit multiply. 
The highest 8-bits are ignore @@ -377,6 +402,15 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [ ]>; +def AMDGPUflat_load_monitor : SDNode< + "AMDGPUISD::FLAT_LOAD_MONITOR", SDTLoad, + [SDNPHasChain, SDNPMemOperand] +>; + +def AMDGPUglobal_load_monitor : SDNode< + "AMDGPUISD::GLOBAL_LOAD_MONITOR", SDTLoad, + [SDNPHasChain, SDNPMemOperand] +>; //===----------------------------------------------------------------------===// // Flow Control Profile Types @@ -394,16 +428,24 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// // Call/Return DAG Nodes //===----------------------------------------------------------------------===// + +// A uniform kernel return that terminates the wavefront. def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; + +// s_endpgm, but we may want to insert it in the middle of the block. def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone, [SDNPHasChain]>; + +// "s_trap 2" equivalent on hardware that does not support it. def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone, [SDNPHasChain]>; +// Return to a shader part's epilog code. def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// Return with values from a non-entry function. 
def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 12915c73..82783dc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -221,14 +227,22 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); + Register VCCReg = I.getOperand(1).getReg(); + MachineInstr *Cmp; + + // Set SCC as a side effect with S_CMP or S_OR. + if (STI.hasScalarCompareEq64()) { + unsigned CmpOpc = + STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0); + } else { + Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst) + .addReg(VCCReg) + .addReg(VCCReg); + } - unsigned CmpOpc = - STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; - MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) - .addReg(I.getOperand(1).getReg()) - .addImm(0); - if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); Register DstReg = I.getOperand(0).getReg(); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC); @@ -269,7 +283,8 @@ bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const { .addImm(0); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const { @@ -283,7 +298,8 @@ bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const { .addReg(SrcReg); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI); + constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { @@ -350,7 +366,7 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); Register Reg = MO.getReg(); BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) - .addReg(Reg, 0, ComposedSubIdx); + .addReg(Reg, {}, ComposedSubIdx); return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), MO.isKill(), MO.isDead(), MO.isUndef(), @@ -400,10 +416,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { // Dead implicit-def of scc I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef - true, // isImp - false, // isKill - true)); // isDead - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + true, // isImp + false, // isKill + true)); // isDead + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } bool 
AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { @@ -429,15 +446,17 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(I.getOperand(2)) .setOperandDead(3); // Dead scc I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + return true; } - if (STI.hasAddNoCarry()) { + if (STI.hasAddNoCarryInsts()) { const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64; @@ -450,7 +469,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(I.getOperand(2)) .addImm(0); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); + return true; } assert(!Sub && "illegal sub should not reach here"); @@ -491,8 +511,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .addReg(CarryReg, RegState::Kill) .addImm(0); - if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI); } BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) @@ -528,7 +547,8 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( I.setDesc(TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc)); I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); I.addOperand(*MF, MachineOperand::CreateImm(0)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } Register Src0Reg = I.getOperand(2).getReg(); @@ -593,7 +613,9 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + I.getOperand(0).setIsEarlyClobber(true); + constrainSelectedInstRegOperands(I, TII, TRI, RBI); + return true; } // TODO: We should probably legalize these to only using 32-bit results. @@ -636,7 +658,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { *SrcRC, I.getOperand(1)); const DebugLoc &DL = I.getDebugLoc(); BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) - .addReg(SrcReg, 0, SubReg); + .addReg(SrcReg, {}, SubReg); I.eraseFromParent(); return true; @@ -709,7 +731,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { for (int I = 0, E = NumDst; I != E; ++I) { MachineOperand &Dst = MI.getOperand(I); BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) - .addReg(SrcReg, 0, SubRegs[I]); + .addReg(SrcReg, {}, SubRegs[I]); // Make sure the subregister index is valid for the source register. 
SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]); @@ -809,15 +831,13 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg) .addImm(0xFFFF) .addReg(Src0); - if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst) .addReg(Src1) .addImm(16) .addReg(TmpReg); - if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); MI.eraseFromParent(); return true; @@ -863,7 +883,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { .setOperandDead(3); // Dead scc MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } if (STI.hasSPackHL()) { Opc = AMDGPU::S_PACK_HL_B32_B16; @@ -872,7 +893,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { } MI.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { @@ -970,7 +992,8 @@ bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const { .addReg(OffsetReg) .addReg(WidthReg); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { @@ -1072,7 +1095,8 @@ bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const { MIB.addReg(VDstIn); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return 
true; } // We need to handle this here because tablegen doesn't support matching @@ -1113,7 +1137,8 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { .addImm(0); // $omod MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { @@ -1200,6 +1225,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_permlane16_swap: case Intrinsic::amdgcn_permlane32_swap: return selectPermlaneSwapIntrin(I, IntrinsicID); + case Intrinsic::amdgcn_wave_shuffle: + return selectWaveShuffleIntrin(I); default: return selectImpl(I, *CoverageInfo); } @@ -1468,8 +1495,8 @@ bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { .add(I.getOperand(3)); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) .addReg(AMDGPU::SCC); + constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); bool Ret = - constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); I.eraseFromParent(); return Ret; @@ -1499,9 +1526,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(), *MRI); - bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); + constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); I.eraseFromParent(); - return Ret; + return true; } bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { @@ -1555,8 +1582,7 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { SelectedMI.addImm(0); // op_sel RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI); - if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI); 
I.eraseFromParent(); return true; @@ -1642,8 +1668,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { .addReg(SrcReg) .addReg(TRI.getExec()) .setOperandDead(3); // Dead scc - if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) - return false; + constrainSelectedInstRegOperands(*And, TII, TRI, RBI); } } @@ -1710,7 +1735,8 @@ bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { } I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { @@ -1834,9 +1860,9 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) return false; - bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); + constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); MI.eraseFromParent(); - return Ret; + return true; } static unsigned gwsIntrinToOpcode(unsigned IntrID) { @@ -1930,20 +1956,52 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, // The resource id offset is computed as (<isa opaque base> + M0[21:16] + // offset field) % 64. Some versions of the programming guide omit the m0 // part, or claim it's from offset 0. 
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); + + unsigned Opc = gwsIntrinToOpcode(IID); + const MCInstrDesc &InstrDesc = TII.get(Opc); if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - MIB.addReg(VSrc); - if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) - return false; - } + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx); + const TargetRegisterClass *SubRC = + TRI.getSubRegisterClass(DataRC, AMDGPU::sub0); - MIB.addImm(ImmOffset) - .cloneMemRefs(MI); + if (!SubRC) { + // 32-bit normal case. + if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI)) + return false; - TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); + BuildMI(*MBB, &MI, DL, InstrDesc) + .addReg(VSrc) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } else { + // Requires even register alignment, so create 64-bit value and pad the + // top half with undef. + Register DataReg = MRI->createVirtualRegister(DataRC); + if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI)) + return false; + + Register UndefReg = MRI->createVirtualRegister(SubRC); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg) + .addReg(VSrc) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); + + BuildMI(*MBB, &MI, DL, InstrDesc) + .addReg(DataReg) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } + } else { + BuildMI(*MBB, &MI, DL, InstrDesc) + .addImm(ImmOffset) + .cloneMemRefs(MI); + } MI.eraseFromParent(); return true; @@ -1978,11 +2036,12 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, .addImm(IsGDS ? 
-1 : 0) .cloneMemRefs(MI); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>(); MFInfo->setInitWholeWave(); @@ -2006,19 +2065,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. + if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2116,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2167,9 @@ 
bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; @@ -2280,7 +2349,8 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( .cloneMemRefs(MI); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( @@ -2306,17 +2376,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_init_whole_wave: return selectInitWholeWave(I); case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_buffer_load_async_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_async_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: return selectBufferLoadLds(I); // Until we can store both the address space of the global and the LDS // arguments by having tto MachineMemOperands on an intrinsic, we just trust // that the argument is a global pointer (buffer pointers have been handled by // a LLVM IR-level lowering). 
case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: return selectGlobalLoadLds(I); + case Intrinsic::amdgcn_asyncmark: + case Intrinsic::amdgcn_wait_asyncmark: + // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. + if (!Subtarget->hasVMemToLDSLoad()) + return false; + break; case Intrinsic::amdgcn_exp_compr: if (!STI.hasCompressedExport()) { Function &F = I.getMF()->getFunction(); @@ -2331,9 +2413,35 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_alloc_vgpr: { + // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets + // SCC. We then need to COPY it into the result vreg. + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + + Register ResReg = I.getOperand(0).getReg(); + + MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR)) + .add(I.getOperand(2)); + (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg) + .addReg(AMDGPU::SCC); + I.eraseFromParent(); + constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI); + return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI); + } case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_wakeup_barrier: { + if (!STI.hasSWakeupBarrier()) { + Function &F = I.getMF()->getFunction(); + F.getContext().diagnose( + DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget", + I.getDebugLoc(), DS_Error)); + return false; + } + return selectNamedBarrierInst(I, IntrinsicID); + } case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, 
IntrinsicID); @@ -2372,11 +2480,10 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(3)); - bool Ret = false; - Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); - Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); I.eraseFromParent(); - return Ret; + return true; } // Wide VGPR select should have been split in RegBankSelect. @@ -2391,9 +2498,9 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(1)); - bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); I.eraseFromParent(); - return Ret; + return true; } bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { @@ -2438,7 +2545,7 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *MBB = I.getParent(); BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(SrcReg, 0, AMDGPU::lo16); + .addReg(SrcReg, {}, AMDGPU::lo16); I.eraseFromParent(); return true; } @@ -2450,9 +2557,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { Register LoReg = MRI->createVirtualRegister(DstRC); Register HiReg = MRI->createVirtualRegister(DstRC); BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(SrcReg, 0, AMDGPU::sub0); + .addReg(SrcReg, {}, AMDGPU::sub0); BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(SrcReg, 0, AMDGPU::sub1); + .addReg(SrcReg, {}, AMDGPU::sub1); if (IsVALU && STI.hasSDWA()) { // Write the low 16-bits of the high element into the high 16-bits of the @@ -2609,7 +2716,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(Mask) .addReg(SrcReg); I.eraseFromParent(); - return 
constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + return true; } const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; @@ -2619,7 +2727,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(0) // Offset .addImm(SrcSize); // Width I.eraseFromParent(); - return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); + return true; } if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { @@ -2644,18 +2753,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; if (Signed) { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) - .addReg(SrcReg, 0, SubReg) - .addImm(31) - .setOperandDead(3); // Dead scc + .addReg(SrcReg, {}, SubReg) + .addImm(31) + .setOperandDead(3); // Dead scc } else { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) .addImm(0); } BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(SrcReg, 0, SubReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + .addReg(SrcReg, {}, SubReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); I.eraseFromParent(); return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); @@ -2673,10 +2782,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) - .addReg(SrcReg, 0, SubReg) - .addImm(AMDGPU::sub0) - .addReg(UndefReg) - .addImm(AMDGPU::sub1); + .addReg(SrcReg, {}, SubReg) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) .addReg(ExtReg) @@ -2810,9 +2919,9 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { Register OpReg = 
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(Src, 0, AMDGPU::sub0); + .addReg(Src, {}, AMDGPU::sub0); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(Src, 0, AMDGPU::sub1); + .addReg(Src, {}, AMDGPU::sub1); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) .addImm(0x80000000); @@ -2852,9 +2961,9 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { return false; BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(Src, 0, AMDGPU::sub0); + .addReg(Src, {}, AMDGPU::sub0); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(Src, 0, AMDGPU::sub1); + .addReg(Src, {}, AMDGPU::sub1); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) .addImm(0x7fffffff); @@ -3093,7 +3202,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { .addReg(MaskReg) .setOperandDead(3); // Dead scc I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; @@ -3129,9 +3239,9 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { // Extract the subregisters from the source pointer. 
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) - .addReg(SrcReg, 0, AMDGPU::sub0); + .addReg(SrcReg, {}, AMDGPU::sub0); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) - .addReg(SrcReg, 0, AMDGPU::sub1); + .addReg(SrcReg, {}, AMDGPU::sub1); Register MaskedLo, MaskedHi; @@ -3144,7 +3254,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { MaskedLo = MRI->createVirtualRegister(&RegRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) - .addReg(MaskReg, 0, AMDGPU::sub0); + .addReg(MaskReg, {}, AMDGPU::sub0); BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) .addReg(LoReg) .addReg(MaskLo); @@ -3158,7 +3268,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { MaskedHi = MRI->createVirtualRegister(&RegRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) - .addReg(MaskReg, 0, AMDGPU::sub1); + .addReg(MaskReg, {}, AMDGPU::sub1); BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) .addReg(HiReg) .addReg(MaskHi); @@ -3246,8 +3356,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( unsigned Opc = Is64 ? 
AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) - .addReg(SrcReg, 0, SubReg) - .addReg(SrcReg, RegState::Implicit); + .addReg(SrcReg, {}, SubReg) + .addReg(SrcReg, RegState::Implicit); MI.eraseFromParent(); return true; } @@ -3259,8 +3369,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(IdxReg); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) - .addReg(SrcReg, 0, SubReg) - .addReg(SrcReg, RegState::Implicit); + .addReg(SrcReg, {}, SubReg) + .addReg(SrcReg, RegState::Implicit); MI.eraseFromParent(); return true; } @@ -3350,11 +3460,25 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( return true; } +static bool isAsyncLDSDMA(Intrinsic::ID Intr) { + switch (Intr) { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_async_lds: + return true; + } + return false; +} + bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { if (!Subtarget->hasVMemToLDSLoad()) return false; unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); + Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); // The struct intrinsic variants add one additional operand over raw. const bool HasVIndex = MI.getNumOperands() == 9; @@ -3444,12 +3568,17 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) ? 1 : 0); // swz + MIB.addImm(isAsyncLDSDMA(IntrinsicID)); MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + // Don't set the offset value here because the pointer points to the base of + // the buffer. 
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; - StorePtrI.V = nullptr; + LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(), + AMDGPUAS::BUFFER_RESOURCE)); + LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & @@ -3464,7 +3593,8 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { MIB.setMemRefs({LoadMMO, StoreMMO}); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } /// Match a zero extend from a 32-bit value to 64-bits. @@ -3561,6 +3691,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); + Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (Size) { default: @@ -3627,13 +3758,18 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (isSGPR(Addr)) MIB.addReg(VOffset); - MIB.add(MI.getOperand(4)) // offset - .add(MI.getOperand(5)); // cpol + MIB.add(MI.getOperand(4)); // offset + + unsigned Aux = MI.getOperand(5).getImm(); + MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol + MIB.addImm(isAsyncLDSDMA(IntrinsicID)); MachineMemOperand *LoadMMO = *MI.memoperands_begin(); MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); LoadPtrI.Offset = MI.getOperand(4).getImm(); MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(), + AMDGPUAS::GLOBAL_ADDRESS)); LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & @@ -3647,7 +3783,8 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ MIB.setMemRefs({LoadMMO, StoreMMO}); 
MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( @@ -3656,8 +3793,9 @@ bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3; MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm())); MI.removeOperand(OpcodeOpIdx); - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + MI.addImplicitDefUseOperands(*MI.getMF()); + constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + return true; } // FIXME: This should be removed and let the patterns select. We just need the @@ -3759,7 +3897,11 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(4); // VDst_In MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -3783,7 +3925,8 @@ bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin( MachineOperand &FI = MI.getOperand(4); FI.setImm(FI.getImm() ? 
AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0); - return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + constrainSelectedInstRegOperands(MI, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { @@ -3814,6 +3957,133 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { return true; } +bool AMDGPUInstructionSelector::selectWaveShuffleIntrin( + MachineInstr &MI) const { + assert(MI.getNumOperands() == 4); + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + Register DstReg = MI.getOperand(0).getReg(); + Register ValReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + + const LLT DstTy = MRI->getType(DstReg); + unsigned DstSize = DstTy.getSizeInBits(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstRB); + + if (DstTy != LLT::scalar(32)) + return false; + + if (!Subtarget->supportsBPermute()) + return false; + + // If we can bpermute across the whole wave, then just do that + if (Subtarget->supportsWaveWideBPermute()) { + Register ShiftIdxReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg) + .addImm(2) + .addReg(IdxReg); + + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg) + .addReg(ShiftIdxReg) + .addReg(ValReg) + .addImm(0); + } else { + // Otherwise, we need to make use of whole wave mode + assert(Subtarget->isWave64()); + + // Set inactive lanes to poison + Register UndefValReg = + MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID)); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg); + + Register UndefExecReg = MRI->createVirtualRegister( + TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID)); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg); + + Register PoisonValReg = 
MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg) + .addImm(0) + .addReg(ValReg) + .addImm(0) + .addReg(UndefValReg) + .addReg(UndefExecReg); + + // ds_bpermute requires index to be multiplied by 4 + Register ShiftIdxReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg) + .addImm(2) + .addReg(IdxReg); + + Register PoisonIdxReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg) + .addImm(0) + .addReg(ShiftIdxReg) + .addImm(0) + .addReg(UndefValReg) + .addReg(UndefExecReg); + + // Get permutation of each half, then we'll select which one to use + Register SameSidePermReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg) + .addReg(PoisonIdxReg) + .addReg(PoisonValReg) + .addImm(0); + + Register SwappedValReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg) + .addReg(PoisonValReg); + + Register OppSidePermReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg) + .addReg(PoisonIdxReg) + .addReg(SwappedValReg) + .addImm(0); + + Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg) + .addReg(OppSidePermReg); + + // Select which side to take the permute from + // We can get away with only using mbcnt_lo here since we're only + // trying to detect which side of 32 each lane is on, and mbcnt_lo + // returns 32 for lanes 32-63. 
+ Register ThreadIDReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg) + .addImm(-1) + .addImm(0); + + Register XORReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg) + .addReg(ThreadIDReg) + .addReg(PoisonIdxReg); + + Register ANDReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg) + .addReg(XORReg) + .addImm(32); + + Register CompareReg = MRI->createVirtualRegister( + TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID)); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg) + .addReg(ANDReg) + .addImm(0); + + // Finally do the selection + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .addReg(WWMSwapPermReg) + .addImm(0) + .addReg(SameSidePermReg) + .addReg(CompareReg); + } + + MI.eraseFromParent(); + return true; +} + // Match BITOP3 operation and return a number of matched instructions plus // truth table. 
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R, @@ -3891,7 +4161,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(Register R, SmallVector<Register, 3> Backup(Src.begin(), Src.end()); if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) { - Src = Backup; + Src = std::move(Backup); return std::make_pair(0, 0); } @@ -4131,6 +4401,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UMAX: case TargetOpcode::G_ATOMICRMW_UINC_WRAP: case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: + case TargetOpcode::G_ATOMICRMW_USUB_COND: + case TargetOpcode::G_ATOMICRMW_USUB_SAT: case TargetOpcode::G_ATOMICRMW_FADD: case TargetOpcode::G_ATOMICRMW_FMIN: case TargetOpcode::G_ATOMICRMW_FMAX: @@ -6726,7 +6998,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6760,6 +7032,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_IMM; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6769,6 +7043,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { llvm_unreachable("not a named barrier op"); case Intrinsic::amdgcn_s_barrier_join: return AMDGPU::S_BARRIER_JOIN_M0; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; @@ -6779,8 +7055,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID 
IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c760fe7..627cce2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,6 +156,7 @@ private: bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierLeave(MachineInstr &I) const; + bool selectWaveShuffleIntrin(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index bd443b5..f77b4c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; -defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; multiclass noret_binary_atomic_op<SDNode atomic_op> { let HasNoUse = true in @@ -695,6 +691,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>; defm 
atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>; defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>; defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>; +defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>; +defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, @@ -745,23 +743,14 @@ int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; int FP64_NEG_ONE = 0xbff0000000000000; +int BF16_ONE = 0x3F80; +int BF16_NEG_ONE = 0xBF80; } def CONST : Constants; -def FP_ZERO : PatLeaf < - (fpimm), - [{return N->getValueAPF().isZero();}] ->; - -def FP_ONE : PatLeaf < - (fpimm), - [{return N->isExactlyValue(1.0);}] ->; - -def FP_HALF : PatLeaf < - (fpimm), - [{return N->isExactlyValue(0.5);}] ->; +def fpimm_zero : FPImmLeaf<fAny, [{ return Imm.isZero(); }]> ; +def fpimm_one : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+1.0); }]> ; +def fpimm_half : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+0.5); }]> ; /* Generic helper patterns for intrinsics */ /* -------------------------------------- */ @@ -806,24 +795,17 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat < (vt rc:$addr) >; -// rotr pattern -class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - // Special conversion patterns -def cvt_rpi_i32_f32 : PatFrag < +let GIIgnoreCopies = 1 in +def cvt_rpi_i32_f32 : PatFrag< (ops node:$src), - (fp_to_sint (ffloor (fadd $src, FP_HALF))), - [{ (void) N; return TM.Options.NoNaNsFPMath; }] ->; + (fp_to_sint (ffloor_nnan (fadd $src, fpimm_half))) +>, GISelFlags; -def cvt_flr_i32_f32 : PatFrag < +def cvt_flr_i32_f32 : PatFrag< (ops node:$src), - (fp_to_sint (ffloor $src)), - [{ (void)N; return 
TM.Options.NoNaNsFPMath; }] + (fp_to_sint (ffloor_nnan $src)) >; let AddedComplexity = 2 in { @@ -841,7 +823,7 @@ class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat < } // AddedComplexity. class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat < - (fdiv FP_ONE, vt:$src), + (fdiv fpimm_one, vt:$src), (RcpInst $src) >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h index df80196..95d88c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULaneMaskUtils.h @@ -31,10 +31,12 @@ public: const unsigned AndSaveExecTermOpc; const unsigned BfmOpc; const unsigned CMovOpc; + const unsigned CmpLGOp; const unsigned CSelectOpc; const unsigned MovOpc; const unsigned MovTermOpc; const unsigned OrOpc; + const unsigned OrN2Op; const unsigned OrTermOpc; const unsigned OrSaveExecOpc; const unsigned XorOpc; @@ -57,10 +59,12 @@ public: : AMDGPU::S_AND_SAVEEXEC_B64_term), BfmOpc(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), CMovOpc(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + CmpLGOp(IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64), CSelectOpc(IsWave32 ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), MovOpc(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), MovTermOpc(IsWave32 ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term), OrOpc(IsWave32 ? AMDGPU::S_OR_B32 : AMDGPU::S_OR_B64), + OrN2Op(IsWave32 ? AMDGPU::S_ORN2_B32 : AMDGPU::S_ORN2_B64), OrTermOpc(IsWave32 ? AMDGPU::S_OR_B32_term : AMDGPU::S_OR_B64_term), OrSaveExecOpc(IsWave32 ? 
AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 7504f1a..63e2656 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -126,7 +126,38 @@ public: return LK.first != TargetLoweringBase::TypeLegal; } - bool isOpLegal(Instruction *I) { return isa<StoreInst, IntrinsicInst>(I); } + bool isOpLegal(const Instruction *I) { + if (isa<IntrinsicInst>(I)) + return true; + + // Any store is a profitable sink (prevents flip-flopping) + if (isa<StoreInst>(I)) + return true; + + if (auto *BO = dyn_cast<BinaryOperator>(I)) { + if (auto *VT = dyn_cast<FixedVectorType>(BO->getType())) { + if (const auto *IT = dyn_cast<IntegerType>(VT->getElementType())) { + unsigned EB = IT->getBitWidth(); + unsigned EC = VT->getNumElements(); + // Check for SDWA-compatible operation + if ((EB == 8 || EB == 16) && ST.hasSDWA() && EC * EB <= 32) { + switch (BO->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return true; + default: + break; + } + } + } + } + } + + return false; + } bool isCoercionProfitable(Instruction *II) { SmallPtrSet<Instruction *, 4> CVisited; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 596a895..5a993a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -30,6 +30,8 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -411,7 +413,7 @@ static unsigned 
maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, switch (AS) { case AMDGPUAS::PRIVATE_ADDRESS: // FIXME: Private element size. - return ST.enableFlatScratch() ? 128 : 32; + return ST.hasFlatScratchEnabled() ? 128 : 32; case AMDGPUAS::LOCAL_ADDRESS: return ST.useDS128() ? 128 : 64; case AMDGPUAS::GLOBAL_ADDRESS: @@ -750,7 +752,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .scalarize(0); - if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { + if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) { // Full set of gfx9 features. if (ST.hasScalarAddSub64()) { getActionDefinitionsBuilder({G_ADD, G_SUB}) @@ -976,9 +978,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FPOpActions.clampMaxNumElementsStrict(0, S32, 2); } + auto &MinNumMaxNumIeee = + getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + + if (ST.hasVOP3PInsts()) { + MinNumMaxNumIeee.legalFor(FPTypesPK16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.has16BitInsts()) { + MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0); + } else { + MinNumMaxNumIeee.legalFor(FPTypesBase) + .clampScalar(0, S32, S64) + .scalarize(0); + } + auto &MinNumMaxNum = getActionDefinitionsBuilder( - {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, - G_FMAXNUM_IEEE}); + {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM}); if (ST.hasVOP3PInsts()) { MinNumMaxNum.customFor(FPTypesPK16) @@ -1039,6 +1057,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) .scalarize(0) .lower(); + + getActionDefinitionsBuilder(G_FMODF) + .lowerFor({S16, S32, S64}) + .scalarize(0) + .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) .customFor({S32, S64, S16}) @@ -1072,6 
+1095,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .minScalar(0, S32) .clampScalar(1, S32, S32) .lower(); + + getActionDefinitionsBuilder(G_FMODF) + .lowerFor({S32, S64}) + .scalarize(0) + .lower(); } auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC); @@ -1171,6 +1199,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); + // clang-format off + auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT}) + .legalFor({{S32, S32}, {S32, S64}}) + .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); + FPToISat.minScalar(1, S32); + FPToISat.minScalar(0, S32) + .widenScalarToNextPow2(0, 32) + .scalarize(0) + .lower(); + // clang-format on + getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) .clampScalar(0, S16, S64) .scalarize(0) @@ -1705,6 +1744,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + auto &Atomics32 = + getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT}) + .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}}); + if (ST.hasFlatAddressSpace()) { + Atomics32.legalFor({{S32, FlatPtr}}); + } + // TODO: v2bf16 operations, and fat buffer pointer support. 
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); if (ST.hasLDSFPAtomicAddF32()) { @@ -2136,9 +2182,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor(FPTypesPK16) .clampMaxNumElements(0, S16, 2) .scalarize(0); + } else if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .lowerFor({V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .lower(); } else { - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .scalarize(0) + .clampScalar(0, S32, S64) + .lower(); } getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) @@ -2195,8 +2249,6 @@ bool AMDGPULegalizerInfo::legalizeCustom( case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINIMUMNUM: case TargetOpcode::G_FMAXIMUMNUM: - case TargetOpcode::G_FMINNUM_IEEE: - case TargetOpcode::G_FMAXNUM_IEEE: return legalizeMinNumMaxNum(Helper, MI); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, B); @@ -2299,14 +2351,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildUnmerge(S32, Dst).getReg(1); } - // TODO: can we be smarter about machine pointer info? - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); Register LoadAddr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); // For code object version 5, private_base and shared_base are passed through // implicit kernargs. if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF()); + AMDGPUTargetLowering::ImplicitParameter Param = AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE : AMDGPUTargetLowering::PRIVATE_BASE; @@ -2321,7 +2373,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return Register(); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, + PtrInfo.getWithOffset(Offset), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), Offset)); @@ -2339,6 +2391,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return Register(); + // TODO: Use custom PseudoSourceValue + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; @@ -2538,8 +2593,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); uint32_t AddrHiVal = Info->get32BitAddressHighBits(); auto PtrLo = B.buildPtrToInt(S32, Src); - auto HighAddr = B.buildConstant(S32, AddrHiVal); - B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); + if (AddrHiVal == 0) { + auto Zext = B.buildZExt(LLT::scalar(64), PtrLo); + B.buildIntToPtr(Dst, Zext); + } else { + auto HighAddr = B.buildConstant(S32, AddrHiVal); + B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); + } + MI.eraseFromParent(); return true; } @@ -2817,23 +2878,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineFunction &MF = Helper.MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || - MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; - - // With ieee_mode disabled, the instructions have the correct behavior - // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. - // - // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode - // enabled. 
- if (!MFI->getMode().IEEE) { - if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || - MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) - return true; - - return !IsIEEEOp; - } - - if (IsIEEEOp) + // With ieee_mode disabled, the instructions have the correct behavior. + if (!MFI->getMode().IEEE) return true; return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; @@ -3145,16 +3191,16 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( return true; // Leave in place; } + const GlobalVariable &GVar = *cast<GlobalVariable>(GV); if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { - Type *Ty = GV->getValueType(); // HIP uses an unsized array `extern __shared__ T s[]` or similar // zero-sized type in other languages to declare the dynamic shared // memory which size is not known at the compile time. They will be // allocated by the runtime and placed directly after the static // allocated ones. They all share the same offset. - if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { + if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) { // Adjust alignment for that dynamic shared memory array. 
- MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(MF.getFunction(), GVar); LLT S32 = LLT::scalar(32); auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); B.buildIntToPtr(DstReg, Sz); @@ -3163,8 +3209,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( } } - B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), - *cast<GlobalVariable>(GV))); + B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar)); MI.eraseFromParent(); return true; } @@ -3383,6 +3428,10 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, case TargetOpcode::G_INTRINSIC: { switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_exp2: + case Intrinsic::amdgcn_sqrt: return true; default: break; @@ -3390,6 +3439,8 @@ static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, break; } + case TargetOpcode::G_FSQRT: + return true; case TargetOpcode::G_FFREXP: { if (DefMI->getOperand(0).getReg() == Src) return true; @@ -3503,14 +3554,10 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, Register X = MI.getOperand(1).getReg(); unsigned Flags = MI.getFlags(); const LLT Ty = MRI.getType(X); - MachineFunction &MF = B.getMF(); const LLT F32 = LLT::scalar(32); const LLT F16 = LLT::scalar(16); - const AMDGPUTargetMachine &TM = - static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); - if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) { if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); @@ -3544,12 +3591,14 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); auto CC = B.buildFConstant(Ty, IsLog10 ? 
cc_log10 : cc_log); - - R = B.buildFMul(Ty, Y, C, Flags).getReg(0); - auto NegR = B.buildFNeg(Ty, R, Flags); - auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); - auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); - R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. + auto NewFlags = Flags & ~(MachineInstr::FmContract); + R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0); + auto NegR = B.buildFNeg(Ty, R, NewFlags); + auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags); + auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags); + R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0); } else { // ch+ct is ln(2)/ln(10) to more than 36 bits const float ch_log10 = 0x1.344000p-2f; @@ -3565,17 +3614,19 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, auto MaskConst = B.buildConstant(Ty, 0xfffff000); auto YH = B.buildAnd(Ty, Y, MaskConst); auto YT = B.buildFSub(Ty, Y, YH, Flags); - auto YTCT = B.buildFMul(Ty, YT, CT, Flags); + // This adds correction terms for which contraction may lead to an increase + // in the error of the approximation, so disable it. 
+ auto NewFlags = Flags & ~(MachineInstr::FmContract); + auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags); Register Mad0 = - getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); - Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); - R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); + getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags); + Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags); + R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags); } const bool IsFiniteOnly = - (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && - MI.getFlag(MachineInstr::FmNoInfs); + MI.getFlag(MachineInstr::FmNoNans) && MI.getFlag(MachineInstr::FmNoInfs); if (!IsFiniteOnly) { // Expand isfinite(x) => fabs(x) < inf @@ -3699,24 +3750,39 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, return true; } +static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, + const SrcOp &Src, unsigned Flags) { + LLT Ty = Dst.getLLTTy(*B.getMRI()); + + if (Ty == LLT::scalar(32)) { + return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst}) + .addUse(Src.getReg()) + .setMIFlags(Flags); + } + return B.buildFExp2(Dst, Src, Flags); +} + +bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B, + Register Dst, Register X, + unsigned Flags, + bool IsExp10) const { + LLT Ty = B.getMRI()->getType(X); + + // exp(x) -> exp2(M_LOG2E_F * x); + // exp10(x) -> exp2(log2(10) * x); + auto Const = B.buildFConstant(Ty, IsExp10 ? 
0x1.a934f0p+1f : numbers::log2e); + auto Mul = B.buildFMul(Ty, X, Const, Flags); + buildExp(B, Dst, Mul, Flags); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register X, unsigned Flags) const { LLT Ty = B.getMRI()->getType(Dst); LLT F32 = LLT::scalar(32); if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { - auto Log2E = B.buildFConstant(Ty, numbers::log2e); - auto Mul = B.buildFMul(Ty, X, Log2E, Flags); - - if (Ty == F32) { - B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) - .addUse(Mul.getReg(0)) - .setMIFlags(Flags); - } else { - B.buildFExp2(Dst, Mul.getReg(0), Flags); - } - - return true; + return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false); } auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); @@ -3739,6 +3805,55 @@ bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, return true; } +bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B, + Register Dst, Register X, + unsigned Flags) const { + LLT Ty = B.getMRI()->getType(Dst); + LLT F32 = LLT::scalar(32); + + if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { + // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); + auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f); + auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f); + + auto Mul1 = B.buildFMul(Ty, X, K1, Flags); + auto Exp2_1 = buildExp(B, Ty, Mul1, Flags); + auto Mul0 = B.buildFMul(Ty, X, K0, Flags); + auto Exp2_0 = buildExp(B, Ty, Mul0, Flags); + B.buildFMul(Dst, Exp2_0, Exp2_1, Flags); + return true; + } + + // bool s = x < -0x1.2f7030p+5f; + // x += s ? 0x1.0p+5f : 0.0f; + // exp10 = exp2(x * 0x1.a92000p+1f) * + // exp2(x * 0x1.4f0978p-11f) * + // (s ? 
0x1.9f623ep-107f : 1.0f); + + auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f); + auto NeedsScaling = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold); + + auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f); + auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); + auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X); + + auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f); + auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f); + + auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags); + auto Exp2_1 = buildExp(B, Ty, Mul1, Flags); + auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags); + auto Exp2_0 = buildExp(B, Ty, Mul0, Flags); + + auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags); + auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f); + auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags); + + B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps); + return true; +} + bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const { Register Dst = MI.getOperand(0).getReg(); @@ -3755,18 +3870,22 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // v_exp_f16 (fmul x, log2e) if (allowApproxFunc(MF, Flags)) { // TODO: Does this really require fast? - legalizeFExpUnsafe(B, Dst, X, Flags); + IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags) + : legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; } + // Nothing in half is a denormal when promoted to f32. + // // exp(f16 x) -> // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) - - // Nothing in half is a denormal when promoted to f32. 
+ // + // exp10(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2(10))) auto Ext = B.buildFPExt(F32, X, Flags); Register Lowered = MRI.createGenericVirtualRegister(F32); - legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); + legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10); B.buildFPTrunc(Dst, Lowered, Flags); MI.eraseFromParent(); return true; @@ -3777,7 +3896,8 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying // library behavior. Also, is known-not-daz source sufficient? if (allowApproxFunc(MF, Flags)) { - legalizeFExpUnsafe(B, Dst, X, Flags); + IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags) + : legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; } @@ -4702,6 +4822,14 @@ bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( return true; } +MachinePointerInfo +AMDGPULegalizerInfo::getKernargSegmentPtrInfo(MachineFunction &MF) const { + // This isn't really a constant pool but close enough. 
+ MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool()); + PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS; + return PtrInfo; +} + Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const { LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -4729,8 +4857,8 @@ bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, "unexpected kernarg parameter type"); Register Ptr = getKernargParameterPtr(B, Offset); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(B.getMF()); + B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); MI.eraseFromParent(); @@ -6042,7 +6170,7 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before // being added, so we can only safely match a 32-bit addition with no unsigned // overflow. 
- bool CheckNUW = AMDGPU::isGFX1250(ST); + bool CheckNUW = ST.hasGFX1250Insts(); std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset( MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW); @@ -6531,8 +6659,15 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32; case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; default: llvm_unreachable("unhandled atomic opcode"); @@ -6766,7 +6901,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } Observer.changingInstr(MI); - auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); + scope_exit ChangedInstr([&] { Observer.changedInstr(MI); }); const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; @@ -7194,7 +7329,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper, bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - if (!ST.isTrapHandlerEnabled() || + if (!ST.hasTrapHandler() || ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return legalizeTrapEndpgm(MI, MRI, B); @@ -7253,9 +7388,9 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( return false; // TODO: can we be smarter about machine pointer info? 
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo = getKernargSegmentPtrInfo(MF); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, + PtrInfo.getWithOffset(Offset), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(64), commonAlignment(Align(64), Offset)); @@ -7314,7 +7449,7 @@ bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI, MachineIRBuilder &B) const { // Is non-HSA path or trap-handler disabled? Then, report a warning // accordingly - if (!ST.isTrapHandlerEnabled() || + if (!ST.hasTrapHandler() || ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { Function &Fn = B.getMF().getFunction(); Fn.getContext().diagnose(DiagnosticInfoUnsupported( @@ -7630,6 +7765,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, // Replace the use G_BRCOND with the exec manipulate and branch pseudos. auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrID) { + case Intrinsic::sponentry: + if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) { + // FIXME: The imported pattern checks for i32 instead of p5; if we fix + // that we can remove this cast. 
+ const LLT S32 = LLT::scalar(32); + Register TmpReg = MRI.createGenericVirtualRegister(S32); + B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg); + + Register DstReg = MI.getOperand(0).getReg(); + B.buildIntToPtr(DstReg, TmpReg); + MI.eraseFromParent(); + } else { + int FI = B.getMF().getFrameInfo().CreateFixedObject( + 1, 0, /*IsImmutable=*/false); + B.buildFrameIndex(MI.getOperand(0), FI); + MI.eraseFromParent(); + } + return true; case Intrinsic::amdgcn_if: case Intrinsic::amdgcn_else: { MachineInstr *Br = nullptr; @@ -7717,7 +7870,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_make_buffer_rsrc: return legalizePointerAsRsrcIntrin(MI, MRI, B); case Intrinsic::amdgcn_kernarg_segment_ptr: - if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { + if (!AMDGPU::isKernel(B.getMF().getFunction())) { // This only makes sense to call in a kernel, so just lower to null. B.buildConstant(MI.getOperand(0).getReg(), 0); MI.eraseFromParent(); @@ -7940,6 +8093,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: @@ -8043,6 +8204,26 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 
B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin()); MI.eraseFromParent(); return true; + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!"); + B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .addMemOperand(*MI.memoperands_begin()); + MI.eraseFromParent(); + return true; + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: + assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!"); + B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .addMemOperand(*MI.memoperands_begin()); + MI.eraseFromParent(); + return true; default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index cd44a9b..1224ee7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -91,8 +91,12 @@ public: bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const; bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags, bool IsExp10) const; bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const; + bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -132,6 +136,7 @@ 
public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const; Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const; bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index aa75534..4de9349 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -47,9 +47,7 @@ namespace llvm { class AMDGPULibCalls { private: - const TargetLibraryInfo *TLInfo = nullptr; - AssumptionCache *AC = nullptr; - DominatorTree *DT = nullptr; + SimplifyQuery SQ; using FuncInfo = llvm::AMDGPULibFunc; @@ -129,11 +127,10 @@ protected: } public: - AMDGPULibCalls() = default; + AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM); bool fold(CallInst *CI); - void initFunction(Function &F, FunctionAnalysisManager &FAM); void initNativeFuncs(); // Replace a normal math function call with that native version @@ -422,11 +419,11 @@ bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( return FPOp->isFast(); } -void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { - AC = &FAM.getResult<AssumptionAnalysis>(F); - TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); - DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); -} +AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM) + : SQ(F.getParent()->getDataLayout(), + &FAM.getResult<TargetLibraryAnalysis>(F), + FAM.getCachedResult<DominatorTreeAnalysis>(F), + &FAM.getResult<AssumptionAnalysis>(F)) {} bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { return AllNative || llvm::is_contained(UseNative, F); @@ -563,74 +560,6 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, return true; } -static bool isKnownIntegral(const 
Value *V, const DataLayout &DL, - FastMathFlags FMF) { - if (isa<PoisonValue>(V)) - return true; - if (isa<UndefValue>(V)) - return false; - - if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) - return CF->getValueAPF().isInteger(); - - auto *VFVTy = dyn_cast<FixedVectorType>(V->getType()); - const Constant *CV = dyn_cast<Constant>(V); - if (VFVTy && CV) { - unsigned NumElts = VFVTy->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = CV->getAggregateElement(i); - if (!Elt) - return false; - if (isa<PoisonValue>(Elt)) - continue; - - const ConstantFP *CFP = dyn_cast<ConstantFP>(Elt); - if (!CFP || !CFP->getValue().isInteger()) - return false; - } - - return true; - } - - const Instruction *I = dyn_cast<Instruction>(V); - if (!I) - return false; - - switch (I->getOpcode()) { - case Instruction::SIToFP: - case Instruction::UIToFP: - // TODO: Could check nofpclass(inf) on incoming argument - if (FMF.noInfs()) - return true; - - // Need to check int size cannot produce infinity, which computeKnownFPClass - // knows how to do already. - return isKnownNeverInfinity(I, SimplifyQuery(DL)); - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - switch (CI->getIntrinsicID()) { - case Intrinsic::trunc: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - return (FMF.noInfs() && FMF.noNaNs()) || - isKnownNeverInfOrNaN(I, SimplifyQuery(DL)); - default: - break; - } - - break; - } - default: - break; - } - - return false; -} - // This function returns false if no change; return true otherwise. 
bool AMDGPULibCalls::fold(CallInst *CI) { Function *Callee = CI->getCalledFunction(); @@ -753,16 +682,14 @@ bool AMDGPULibCalls::fold(CallInst *CI) { // pow(x, y) -> powr(x, y) for x >= -0.0 // TODO: Account for flags on current call - if (PowrFunc && - cannotBeOrderedLessThanZero( - FPOp->getOperand(0), - SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) { + if (PowrFunc && cannotBeOrderedLessThanZero( + FPOp->getOperand(0), SQ.getWithInstruction(Call))) { Call->setCalledFunction(PowrFunc); return fold_pow(FPOp, B, PowrInfo) || true; } // pow(x, y) -> pown(x, y) for known integral y - if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(), + if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(CI), FPOp->getFastMathFlags())) { FunctionType *PownType = getPownType(CI->getFunctionType()); AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true); @@ -845,7 +772,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { return false; } } - LLVMContext &context = CI->getParent()->getParent()->getContext(); + LLVMContext &context = CI->getContext(); Constant *nval; if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector<float, 0> FVal; @@ -1084,7 +1011,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) { // We cannot handle corner cases for a general pow() function, give up // unless y is a constant integral value. Then proceed as if it were pown. 
- if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags())) + if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)), + FPOp->getFastMathFlags())) return false; } @@ -1113,22 +1041,33 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F"); } nval = B.CreateFMul(opr1, nval, "__ylogx"); - nval = CreateCallEx(B,ExpExpr, nval, "__exp2"); + + CallInst *Exp2Call = CreateCallEx(B, ExpExpr, nval, "__exp2"); + + // TODO: Generalized fpclass logic for pow + FPClassTest KnownNot = FPClassTest::fcNegative; + if (FPOp->hasNoNaNs()) + KnownNot |= FPClassTest::fcNan; + + Exp2Call->addRetAttr( + Attribute::getWithNoFPClass(Exp2Call->getContext(), KnownNot)); + nval = Exp2Call; if (needcopysign) { Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits()); Type *nTy = FPOp->getType()->getWithNewType(nTyS); - unsigned size = nTy->getScalarSizeInBits(); Value *opr_n = FPOp->getOperand(1); if (opr_n->getType()->getScalarType()->isIntegerTy()) opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou"); else opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); + unsigned size = nTy->getScalarSizeInBits(); Value *sign = B.CreateShl(opr_n, size-1, "__yeven"); sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign"); - nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign); - nval = B.CreateBitCast(nval, opr0->getType()); + + nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()), + nullptr, "__pow_sign"); } LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " @@ -1333,7 +1272,7 @@ AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B, // TODO: Is it worth trying to preserve the location for the cos calls for the // load? 
- LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc); + LoadInst *LoadCos = B.CreateLoad(Arg->getType(), Alloc); return {SinCos, LoadCos, SinCos}; } @@ -1699,9 +1638,8 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) { - AMDGPULibCalls Simplifier; + AMDGPULibCalls Simplifier(F, AM); Simplifier.initNativeFuncs(); - Simplifier.initFunction(F, AM); bool Changed = false; @@ -1728,9 +1666,8 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, if (UseNative.empty()) return PreservedAnalyses::all(); - AMDGPULibCalls Simplifier; + AMDGPULibCalls Simplifier(F, AM); Simplifier.initNativeFuncs(); - Simplifier.initFunction(F, AM); bool Changed = false; for (auto &BB : F) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 0a59132..05e97d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst( MemSetInst &MSI) { if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER) return false; - llvm::expandMemSetAsLoop(&MSI); + llvm::expandMemSetAsLoop(&MSI, + TM->getTargetTransformInfo(*MSI.getFunction())); MSI.eraseFromParent(); return true; } @@ -1565,8 +1566,11 @@ void SplitPtrStructs::processConditionals() { } else if (isa<SelectInst>(I)) { if (MaybeRsrc) { if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) { - ConditionalTemps.push_back(RsrcInst); - RsrcInst->replaceAllUsesWith(*MaybeRsrc); + // Guard against conditionals that were already folded away. 
+ if (RsrcInst != *MaybeRsrc) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } } for (Value *V : Seen) FoundRsrcs[V] = *MaybeRsrc; @@ -1745,6 +1749,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, case AtomicRMWInst::FMin: IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; break; + case AtomicRMWInst::USubCond: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32; + break; + case AtomicRMWInst::USubSat: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32; + break; case AtomicRMWInst::FSub: { reportFatalUsageError( "atomic floating point subtraction not supported for " @@ -1770,14 +1780,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, break; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: - reportFatalUsageError("wrapping increment/decrement not supported for " - "buffer resources and should've ben expanded away"); + reportFatalUsageError( + "wrapping increment/decrement not supported for " + "buffer resources and should've been expanded away"); break; case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Not sure how we got a bad binop"); - case AtomicRMWInst::USubCond: - case AtomicRMWInst::USubSat: - break; } } @@ -2059,17 +2067,7 @@ PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { "Pointer comparison is only equal or unequal"); auto [LhsRsrc, LhsOff] = getPtrParts(Lhs); auto [RhsRsrc, RhsOff] = getPtrParts(Rhs); - Value *RsrcCmp = - IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc"); - copyMetadata(RsrcCmp, &Cmp); - Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off"); - copyMetadata(OffCmp, &Cmp); - - Value *Res = nullptr; - if (Pred == ICmpInst::ICMP_EQ) - Res = IRB.CreateAnd(RsrcCmp, OffCmp); - else if (Pred == ICmpInst::ICMP_NE) - Res = IRB.CreateOr(RsrcCmp, OffCmp); + Value *Res = IRB.CreateICmp(Pred, LhsOff, RhsOff); copyMetadata(Res, &Cmp); Res->takeName(&Cmp); 
SplitUsers.insert(&Cmp); @@ -2210,6 +2208,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: return true; } } @@ -2298,7 +2297,8 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } - case Intrinsic::amdgcn_load_to_lds: { + case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: { Value *Ptr = I.getArgOperand(0); if (!isSplitFatPtr(Ptr->getType())) return {nullptr, nullptr}; @@ -2309,9 +2309,12 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { Value *ImmOff = I.getArgOperand(3); Value *Aux = I.getArgOperand(4); Value *SOffset = IRB.getInt32(0); + Intrinsic::ID NewIntr = + IID == Intrinsic::amdgcn_load_to_lds + ? Intrinsic::amdgcn_raw_ptr_buffer_load_lds + : Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds; Instruction *NewLoad = IRB.CreateIntrinsic( - Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, - {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); + NewIntr, {}, {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); copyMetadata(NewLoad, &I); SplitUsers.insert(&I); I.replaceAllUsesWith(NewLoad); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp new file mode 100644 index 0000000..c26e973 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -0,0 +1,240 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower LDS global variables with target extension type "amdgpu.named.barrier" +// that require specialized address assignment. It assigns a unique +// barrier identifier to each named-barrier LDS variable and encodes +// this identifier within the !absolute_symbol metadata of that global. +// This encoding ensures that subsequent LDS lowering passes can process these +// barriers correctly without conflicts. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMemoryUtils.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +#include <algorithm> + +#define DEBUG_TYPE "amdgpu-lower-exec-sync" + +using namespace llvm; +using namespace AMDGPU; + +namespace { + +// If GV is also used directly by other kernels, create a new GV +// used only by this kernel and its function. +static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (isKernel(*F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." 
+ KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast<Instruction>(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernel(*F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; +} + +// Write the specified address into metadata where it can be retrieved by +// the assembler. Format is a half open range, [Address Address+1) +static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + LLVMContext &Ctx = M->getContext(); + auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, {MinC, MaxC})); +} + +template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) { + sort(V, [](const auto *L, const auto *R) { + return L->getName() < R->getName(); + }); + return {std::move(V)}; +} + +// Main utility function for special LDS variables lowering. +static bool lowerExecSyncGlobalVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + SmallVector<GlobalVariable *> OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = GV->getGlobalSize(DL) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + SmallVector<Function *> OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernel(*F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + DenseMap<Function *, uint32_t> Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = GV->getGlobalSize(DL) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernel(*K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; +} + +static bool runLowerExecSyncGlobals(Module &M) { + CallGraph CG = CallGraph(M); + bool Changed = false; + Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + + // For each kernel, what variables does it access directly or through + // callees + LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + + // For each variable accessed through callees, which kernels access it + VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernel(*F)); + for (GlobalVariable *GV : K.second) { + LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + } + } + + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerExecSyncGlobalVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + return Changed; +} + +class AMDGPULowerExecSyncLegacy : public ModulePass { +public: + static char ID; + AMDGPULowerExecSyncLegacy() : ModulePass(ID) {} + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPULowerExecSyncLegacy::ID = 0; +char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; + +INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, + "AMDGPU lowering of execution 
synchronization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, + "AMDGPU lowering of execution synchronization", false, + false) + +bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { + return runLowerExecSyncGlobals(M); +} + +ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() { + return new AMDGPULowerExecSyncLegacy(); +} + +PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index dec781d..f93b0b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -12,14 +12,26 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUAsanInstrumentation.h" #include "GCNSubtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetMachine.h" +#include <optional> +#include <string> #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" @@ -37,6 +49,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetPassConfig>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesAll(); } }; @@ -58,13 +71,131 @@ static BasicBlock::iterator 
getInsertPt(BasicBlock &BB) { return InsPt; } -static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { +static void addAliasScopeMetadata(Function &F, const DataLayout &DL, + DominatorTree &DT) { + // Collect noalias arguments. + SmallVector<const Argument *, 4u> NoAliasArgs; + + for (Argument &Arg : F.args()) + if (Arg.hasNoAliasAttr() && !Arg.use_empty()) + NoAliasArgs.push_back(&Arg); + + if (NoAliasArgs.empty()) + return; + + // Add alias scopes for each noalias argument. + MDBuilder MDB(F.getContext()); + DenseMap<const Argument *, MDNode *> NewScopes; + MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName()); + + for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) { + const Argument *Arg = NoAliasArgs[I]; + MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Arg->getName()); + NewScopes.insert({Arg, NewScope}); + } + + // Iterate over all instructions. + for (inst_iterator Inst = inst_begin(F), InstEnd = inst_end(F); + Inst != InstEnd; ++Inst) { + // If instruction accesses memory, collect its pointer arguments. + Instruction *I = &(*Inst); + SmallVector<const Value *, 2u> PtrArgs; + + if (std::optional<MemoryLocation> MO = MemoryLocation::getOrNone(I)) + PtrArgs.push_back(MO->Ptr); + else if (const CallBase *Call = dyn_cast<CallBase>(I)) { + if (Call->doesNotAccessMemory()) + continue; + + for (Value *Arg : Call->args()) { + if (!Arg->getType()->isPointerTy()) + continue; + + PtrArgs.push_back(Arg); + } + } + + if (PtrArgs.empty()) + continue; + + // Collect underlying objects of pointer arguments. 
+ SmallVector<Metadata *, 4u> Scopes; + SmallPtrSet<const Value *, 4u> ObjSet; + SmallVector<Metadata *, 4u> NoAliases; + + for (const Value *Val : PtrArgs) { + SmallVector<const Value *, 4u> Objects; + getUnderlyingObjects(Val, Objects); + ObjSet.insert_range(Objects); + } + + bool RequiresNoCaptureBefore = false; + bool UsesUnknownObject = false; + bool UsesAliasingPtr = false; + + for (const Value *Val : ObjSet) { + if (isa<ConstantData>(Val)) + continue; + + if (const Argument *Arg = dyn_cast<Argument>(Val)) { + if (!Arg->hasAttribute(Attribute::NoAlias)) + UsesAliasingPtr = true; + } else + UsesAliasingPtr = true; + + if (isEscapeSource(Val)) + RequiresNoCaptureBefore = true; + else if (!isa<Argument>(Val) && isIdentifiedObject(Val)) + UsesUnknownObject = true; + } + + if (UsesUnknownObject) + continue; + + // Collect noalias scopes for instruction. + for (const Argument *Arg : NoAliasArgs) { + if (ObjSet.contains(Arg)) + continue; + + if (!RequiresNoCaptureBefore || + !capturesAnything(PointerMayBeCapturedBefore( + Arg, false, I, &DT, false, CaptureComponents::Provenance))) + NoAliases.push_back(NewScopes[Arg]); + } + + // Add noalias metadata to instruction. + if (!NoAliases.empty()) { + MDNode *NewMD = + MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_noalias), + MDNode::get(F.getContext(), NoAliases)); + Inst->setMetadata(LLVMContext::MD_noalias, NewMD); + } + + // Collect scopes for alias.scope metadata. + if (!UsesAliasingPtr) + for (const Argument *Arg : NoAliasArgs) { + if (ObjSet.count(Arg)) + Scopes.push_back(NewScopes[Arg]); + } + + // Add alias.scope metadata to instruction. 
+ if (!Scopes.empty()) { + MDNode *NewMD = + MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(F.getContext(), Scopes)); + Inst->setMetadata(LLVMContext::MD_alias_scope, NewMD); + } + } +} + +static bool lowerKernelArguments(Function &F, const TargetMachine &TM, + DominatorTree &DT) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) return false; const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); const DataLayout &DL = F.getDataLayout(); BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock)); @@ -86,6 +217,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; + + addAliasScopeMetadata(F, F.getParent()->getDataLayout(), DT); + for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); @@ -124,11 +258,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) && !ST.hasUsableDSOffset()) continue; - - // FIXME: We can replace this with equivalent alias.scope/noalias - // metadata, but this appears to be a lot of work. - if (Arg.hasNoAliasAttr()) - continue; } auto *VT = dyn_cast<FixedVectorType>(ArgTy); @@ -215,8 +344,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { } } - // TODO: Convert noalias arg to !noalias - if (DoShiftOpt) { Value *ExtractBits = OffsetDiff == 0 ? 
Load : Builder.CreateLShr(Load, OffsetDiff * 8); @@ -245,7 +372,8 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { auto &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); - return lowerKernelArguments(F, TM); + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return lowerKernelArguments(F, TM, DT); } INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, @@ -261,7 +389,8 @@ FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() { PreservedAnalyses AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) { - bool Changed = lowerKernelArguments(F, TM); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + bool Changed = lowerKernelArguments(F, TM, DT); if (Changed) { // TODO: Preserves a lot more. PreservedAnalyses PA; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index fed7a13..fbfb710 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===// +//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -19,6 +19,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -66,13 +67,11 @@ public: bool runOnModule(Module &M) override; - StringRef getPassName() const override { - return "AMDGPU Kernel Attributes"; - } + StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - } + } }; Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { @@ -98,26 +97,28 @@ static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, } static bool processUse(CallInst *CI, bool IsV5OrAbove) { - Function *F = CI->getParent()->getParent(); + Function *F = CI->getFunction(); auto *MD = F->getMetadata("reqd_work_group_size"); const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; const bool HasUniformWorkGroupSize = - F->getFnAttribute("uniform-work-group-size").getValueAsBool(); + F->getFnAttribute("uniform-work-group-size").getValueAsBool(); SmallVector<unsigned> MaxNumWorkgroups = AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", /*Size=*/3, /*DefaultVal=*/0); if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && + !Intrinsic::getDeclarationIfExists(CI->getModule(), + Intrinsic::amdgcn_dispatch_ptr) && none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) return false; Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; - Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; - Value *Remainders[3] = {nullptr, nullptr, nullptr}; - Value *GridSizes[3] = {nullptr, nullptr, nullptr}; + Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; + Value *Remainders[3] = {nullptr, nullptr, nullptr}; + Value *GridSizes[3] = {nullptr, nullptr, nullptr}; const DataLayout &DL = F->getDataLayout(); @@ -230,13 +231,15 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { bool 
MadeChange = false; if (IsV5OrAbove && HasUniformWorkGroupSize) { - // Under v5 __ockl_get_local_size returns the value computed by the expression: + // Under v5 __ockl_get_local_size returns the value computed by the + // expression: // - // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder + // workgroup_id < hidden_block_count ? hidden_group_size : + // hidden_remainder // - // For functions with the attribute uniform-work-group-size=true. we can evaluate - // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned - // for __ockl_get_local_size. + // For functions with the attribute uniform-work-group-size=true. we can + // evaluate workgroup_id < hidden_block_count as true, and thus + // hidden_group_size is returned for __ockl_get_local_size. for (int I = 0; I < 3; ++I) { Value *BlockCount = BlockCounts[I]; if (!BlockCount) @@ -261,7 +264,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { for (Value *Remainder : Remainders) { if (!Remainder) continue; - Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType())); + Remainder->replaceAllUsesWith( + Constant::getNullValue(Remainder->getType())); MadeChange = true; } } else if (HasUniformWorkGroupSize) { // Pre-V5. 
@@ -302,13 +306,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { continue; for (User *UMin : ZextGroupSize->users()) { - if (match(UMin, - m_UMin(m_Sub(m_Specific(GridSize), - m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))), - m_Specific(ZextGroupSize)))) { + if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, + m_Specific(ZextGroupSize))), + m_Specific(ZextGroupSize)))) { if (HasReqdWorkGroupSize) { - ConstantInt *KnownSize - = mdconst::extract<ConstantInt>(MD->getOperand(I)); + ConstantInt *KnownSize = + mdconst::extract<ConstantInt>(MD->getOperand(I)); UMin->replaceAllUsesWith(ConstantFoldIntegerCast( KnownSize, UMin->getType(), false, DL)); } else { @@ -322,6 +326,49 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { } } + // Upgrade the old method of calculating the block size using the grid size. + // We pattern match any case where the implicit argument group size is the + // divisor to a dispatch packet grid size read of the same dimension. 
+ if (IsV5OrAbove) { + for (int I = 0; I < 3; I++) { + Value *GroupSize = GroupSizes[I]; + if (!GroupSize || !GroupSize->getType()->isIntegerTy(16)) + continue; + + for (User *U : GroupSize->users()) { + Instruction *Inst = cast<Instruction>(U); + if (isa<ZExtInst>(Inst) && !Inst->use_empty()) + Inst = cast<Instruction>(*Inst->user_begin()); + + using namespace llvm::PatternMatch; + if (!match( + Inst, + m_UDiv(m_ZExtOrSelf(m_Load(m_GEP( + m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(), + m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))), + m_Value()))) + continue; + + IRBuilder<> Builder(Inst); + + Value *GEP = Builder.CreateInBoundsGEP( + Builder.getInt8Ty(), CI, + {ConstantInt::get(Type::getInt64Ty(CI->getContext()), + HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))}); + Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP); + BlockCount->setMetadata(LLVMContext::MD_invariant_load, + MDNode::get(CI->getContext(), {})); + BlockCount->setMetadata(LLVMContext::MD_noundef, + MDNode::get(CI->getContext(), {})); + + Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType()); + Inst->replaceAllUsesWith(BlockCountExt); + Inst->eraseFromParent(); + MadeChange = true; + } + } + } + // If reqd_work_group_size is set, we can replace work group size with it. if (!HasReqdWorkGroupSize) return MadeChange; @@ -340,7 +387,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { return MadeChange; } - // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get // TargetPassConfig for subtarget. 
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { @@ -364,7 +410,6 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { return MadeChange; } - INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU Kernel Attributes", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, @@ -385,12 +430,14 @@ AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. return PreservedAnalyses::all(); + bool Changed = false; for (Instruction &I : instructions(F)) { if (CallInst *CI = dyn_cast<CallInst>(&I)) { if (CI->getCalledFunction() == BasePtr) - processUse(CI, IsV5OrAbove); + Changed |= processUse(CI, IsV5OrAbove); } } - return PreservedAnalyses::all(); + return !Changed ? PreservedAnalyses::all() + : PreservedAnalyses::none().preserveSet<CFGAnalyses>(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524..588eee0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -441,7 +441,7 @@ public: return KernelSet; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) { if (VariableSet.contains(GV)) { @@ -501,9 +501,7 @@ public: // strategy continue; } - CandidateTy Candidate( - GV, K.second.size(), - DL.getTypeAllocSize(GV->getValueType()).getFixedValue()); + CandidateTy Candidate(GV, K.second.size(), GV->getGlobalSize(DL)); if (MostUsed < Candidate) MostUsed = Candidate; } @@ -555,7 +553,7 @@ public: for (Function &Func : M->functions()) { if (Func.isDeclaration()) continue; - if (!isKernelLDS(&Func)) + if (!isKernel(Func)) continue; if (KernelsThatAllocateTableLDS.contains(&Func) || @@ -703,7 +701,7 @@ public: return false; } Function *F = 
I->getFunction(); - return !isKernelLDS(F); + return !isKernel(*F); }); // Replace uses of module scope variable from kernel functions that @@ -711,7 +709,7 @@ public: // Record on each kernel whether the module scope global is used by it for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; if (KernelsThatAllocateModuleLDS.contains(&Func)) { @@ -743,7 +741,7 @@ public: DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; DenseSet<GlobalVariable *> KernelUsedVariables; @@ -828,7 +826,7 @@ public: // semantics. Setting the alignment here allows this IR pass to accurately // predict the exact constant at which it will be allocated. - assert(isKernelLDS(func)); + assert(isKernel(*func)); LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -878,7 +876,7 @@ public: for (auto &func : OrderedKernels) { if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) { - assert(isKernelLDS(func)); + assert(isKernel(*func)); if (!func->hasName()) { reportFatalUsageError("anonymous kernels cannot use LDS variables"); } @@ -912,7 +910,7 @@ public: auto *I = dyn_cast<Instruction>(U.getUser()); if (!I) continue; - if (isKernelLDS(I->getFunction())) + if (isKernel(*I->getFunction())) continue; replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr); @@ -922,126 +920,6 @@ public: return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use &U : GV->uses()) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only 
by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast<Instruction>(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector<GlobalVariable *> OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. 
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector<Function *> OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap<Function *, uint32_t> Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. 
- auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1058,18 +936,12 @@ public: VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; for (auto &K : LDSUsesInfo.indirect_access) { Function *F = K.first; - assert(isKernelLDS(F)); + assert(isKernel(*F)); for (GlobalVariable *GV : K.second) { LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet<GlobalVariable *> ModuleScopeVariables; DenseSet<GlobalVariable *> TableLookupVariables; @@ -1157,7 +1029,7 @@ public: const DataLayout &DL = M.getDataLayout(); for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; // All three of these are optional. 
The first variable is allocated at @@ -1187,14 +1059,14 @@ public: if (AllocateModuleScopeStruct) { // Allocated at zero, recorded once on construction, not once per // kernel - Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); + Offset += MaybeModuleScopeStruct->getGlobalSize(DL); } if (AllocateKernelScopeStruct) { GlobalVariable *KernelStruct = Replacement->second.SGV; Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct)); recordLDSAbsoluteAddress(&M, KernelStruct, Offset); - Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); + Offset += KernelStruct->getGlobalSize(DL); } // If there is dynamic allocation, the alignment needed is included in @@ -1264,7 +1136,7 @@ private: } Align Alignment = AMDGPU::getAlign(DL, &GV); - TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + uint64_t GVSize = GV.getGlobalSize(DL); if (GVSize > 8) { // We might want to use a b96 or b128 load/store @@ -1310,8 +1182,7 @@ private: LDSVarsToTransform.begin(), LDSVarsToTransform.end())); for (GlobalVariable *GV : Sorted) { - OptimizedStructLayoutField F(GV, - DL.getTypeAllocSize(GV->getValueType()), + OptimizedStructLayoutField F(GV, GV->getGlobalSize(DL), AMDGPU::getAlign(DL, GV)); LayoutFields.emplace_back(F); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp index 1e6589e..f4872ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerVGPREncoding.cpp @@ -43,9 +43,10 @@ #include "AMDGPULowerVGPREncoding.h" #include "AMDGPU.h" #include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "SIInstrInfo.h" -#include "llvm/ADT/PackedVector.h" +#include "llvm/ADT/bit.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -57,21 +58,44 @@ class AMDGPULowerVGPREncoding { static constexpr unsigned OpNum = 4; static constexpr unsigned BitsPerField = 2; static constexpr unsigned 
NumFields = 4; - static constexpr unsigned FieldMask = (1 << BitsPerField) - 1; - using ModeType = PackedVector<unsigned, BitsPerField, - std::bitset<BitsPerField * NumFields>>; + static constexpr unsigned ModeWidth = NumFields * BitsPerField; + static constexpr unsigned ModeMask = (1 << ModeWidth) - 1; + static constexpr unsigned VGPRMSBShift = + llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB); - class ModeTy : public ModeType { - public: - // bitset constructor will set all bits to zero - ModeTy() : ModeType(0) {} + struct OpMode { + // No MSBs set means they are not required to be of a particular value. + std::optional<unsigned> MSBits; + + bool update(const OpMode &New, bool &Rewritten) { + bool Updated = false; + if (New.MSBits) { + if (*New.MSBits != MSBits.value_or(0)) { + Updated = true; + Rewritten |= MSBits.has_value(); + } + MSBits = New.MSBits; + } + return Updated; + } + }; + + struct ModeTy { + OpMode Ops[OpNum]; - operator int64_t() const { return raw_bits().to_ulong(); } + bool update(const ModeTy &New, bool &Rewritten) { + bool Updated = false; + for (unsigned I : seq(OpNum)) + Updated |= Ops[I].update(New.Ops[I], Rewritten); + return Updated; + } - static ModeTy fullMask() { - ModeTy M; - M.raw_bits().flip(); - return M; + unsigned encode() const { + // Layout: [src0 msb, src1 msb, src2 msb, dst msb]. + unsigned V = 0; + for (const auto &[I, Op] : enumerate(Ops)) + V |= Op.MSBits.value_or(0) << (I * 2); + return V; } }; @@ -82,19 +106,15 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; + // Current basic block. + MachineBasicBlock *MBB; + /// Most recent s_set_* instruction. MachineInstr *MostRecentModeSet; - /// Whether the current mode is known. - bool CurrentModeKnown; - /// Current mode bits. ModeTy CurrentMode; - /// Current mask of mode bits that instructions since MostRecentModeSet care - /// about. - ModeTy CurrentMask; - /// Number of current hard clause instructions. 
unsigned ClauseLen; @@ -108,10 +128,15 @@ private: MachineInstr *Clause; /// Insert mode change before \p I. \returns true if mode was changed. - bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I); + bool setMode(ModeTy NewMode, MachineBasicBlock::instr_iterator I); /// Reset mode to default. - void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); } + void resetMode(MachineBasicBlock::instr_iterator I) { + ModeTy Mode; + for (OpMode &Op : Mode.Ops) + Op.MSBits = 0; + setMode(Mode, I); + } /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt. std::optional<unsigned> getMSBs(const MachineOperand &MO) const; @@ -119,49 +144,69 @@ private: /// Handle single \p MI. \return true if changed. bool runOnMachineInstr(MachineInstr &MI); - /// Compute the mode and mode mask for a single \p MI given \p Ops operands + /// Compute the mode for a single \p MI given \p Ops operands /// bit mapping. Optionally takes second array \p Ops2 for VOPD. /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2 /// is checked. - void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI, + void computeMode(ModeTy &NewMode, MachineInstr &MI, const AMDGPU::OpName Ops[OpNum], const AMDGPU::OpName *Ops2 = nullptr); /// Check if an instruction \p I is within a clause and returns a suitable /// iterator to insert mode change. It may also modify the S_CLAUSE /// instruction to extend it or drop the clause if it cannot be adjusted. - MachineInstr *handleClause(MachineInstr *I); + MachineBasicBlock::instr_iterator + handleClause(MachineBasicBlock::instr_iterator I); + + /// Check if an instruction \p I is immediately after another program state + /// instruction which it cannot coissue with. If so, insert before that + /// instruction to encourage more coissuing. + MachineBasicBlock::instr_iterator + handleCoissue(MachineBasicBlock::instr_iterator I); + + /// Handle S_SETREG_IMM32_B32 targeting MODE register. 
On certain hardware, + /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore + /// the current mode. \returns true if the instruction was modified or a + /// new one was inserted. + bool handleSetregMode(MachineInstr &MI); + + /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain + /// the VGPR MSB mode value. \returns true if the immediate was changed. + bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue); }; -bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask, - MachineInstr *I) { - assert((NewMode.raw_bits() & ~Mask.raw_bits()).none()); +bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, + MachineBasicBlock::instr_iterator I) { + // Record previous mode into high 8 bits of the immediate. + int64_t OldModeBits = CurrentMode.encode() << ModeWidth; - if (CurrentModeKnown) { - auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits(); + bool Rewritten = false; + if (!CurrentMode.update(NewMode, Rewritten)) + return false; - if ((Delta & Mask.raw_bits()).none()) { - CurrentMask |= Mask; - return false; + if (MostRecentModeSet && !Rewritten) { + // Update MostRecentModeSet with the new mode. It can be either + // S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12). + if (MostRecentModeSet->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { + MachineOperand &Op = MostRecentModeSet->getOperand(0); + // Carry old mode bits from the existing instruction. 
+ int64_t OldModeBits = Op.getImm() & (ModeMask << ModeWidth); + Op.setImm(CurrentMode.encode() | OldModeBits); + } else { + assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 && + "unexpected MostRecentModeSet opcode"); + updateSetregModeImm(*MostRecentModeSet, CurrentMode.encode()); } - if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) { - CurrentMode |= NewMode; - CurrentMask |= Mask; - - MostRecentModeSet->getOperand(0).setImm(CurrentMode); - return true; - } + return true; } I = handleClause(I); - MostRecentModeSet = - BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) - .addImm(NewMode); + I = handleCoissue(I); + MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(NewMode.encode() | OldModeBits); CurrentMode = NewMode; - CurrentMask = Mask; - CurrentModeKnown = true; return true; } @@ -179,12 +224,10 @@ AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const { return Idx >> 8; } -void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask, - MachineInstr &MI, +void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, MachineInstr &MI, const AMDGPU::OpName Ops[OpNum], const AMDGPU::OpName *Ops2) { NewMode = {}; - Mask = {}; for (unsigned I = 0; I < OpNum; ++I) { MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]); @@ -223,31 +266,31 @@ void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask, TII->hasVALU32BitEncoding(MI.getOpcode())))) continue; - NewMode[I] = MSBits.value(); - Mask[I] = FieldMask; + NewMode.Ops[I].MSBits = MSBits.value(); } } bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) { auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc()); if (Ops.first) { - ModeTy NewMode, Mask; - computeMode(NewMode, Mask, MI, Ops.first, Ops.second); - return setMode(NewMode, Mask, &MI); + ModeTy NewMode; + computeMode(NewMode, MI, Ops.first, Ops.second); + return setMode(NewMode, MI.getIterator()); } 
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo()); return false; } -MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) { if (!ClauseRemaining) return I; // A clause cannot start with a special instruction, place it right before // the clause. if (ClauseRemaining == ClauseLen) { - I = Clause->getPrevNode(); + I = Clause->getPrevNode()->getIterator(); assert(I->isBundle()); return I; } @@ -272,6 +315,106 @@ MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) { return I; } +MachineBasicBlock::instr_iterator +AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) { + if (I.isEnd()) + return I; + + // "Program State instructions" are instructions which are used to control + // operation of the GPU rather than performing arithmetic. Such instructions + // have different coissuing rules w.r.t s_set_vgpr_msb. + auto isProgramStateInstr = [this](MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + return TII->isBarrier(Opc) || TII->isWaitcnt(Opc) || + Opc == AMDGPU::S_DELAY_ALU; + }; + + while (!I.isEnd() && I != I->getParent()->begin()) { + auto Prev = std::prev(I); + if (!isProgramStateInstr(&*Prev)) + return I; + I = Prev; + } + + return I; +} + +/// Convert mode value from S_SET_VGPR_MSB format to MODE register format. +/// S_SET_VGPR_MSB uses: (src0[0-1], src1[2-3], src2[4-5], dst[6-7]) +/// MODE register uses: (dst[0-1], src0[2-3], src1[4-5], src2[6-7]) +/// This is a left rotation by 2 bits on an 8-bit value. 
+static int64_t convertModeToSetregFormat(int64_t Mode) { + assert(isUInt<8>(Mode) && "Mode expected to be 8-bit"); + return llvm::rotl<uint8_t>(static_cast<uint8_t>(Mode), /*R=*/2); +} + +bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI, + int64_t ModeValue) { + assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32); + + // Convert from S_SET_VGPR_MSB format to MODE register format + int64_t SetregMode = convertModeToSetregFormat(ModeValue); + + MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm); + int64_t OldImm = ImmOp->getImm(); + int64_t NewImm = + (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift); + ImmOp->setImm(NewImm); + return NewImm != OldImm; +} + +bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) { + using namespace AMDGPU::Hwreg; + + assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 && + "only S_SETREG_IMM32_B32 needs to be handled"); + + MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16); + assert(SIMM16Op && "SIMM16Op must be present"); + + auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm()); + (void)Offset; + if (HwRegId != ID_MODE) + return false; + + int64_t ModeValue = CurrentMode.encode(); + + // Case 1: Size <= 12 - the original instruction uses imm32[0:Size-1], so + // imm32[12:19] is unused. Safe to set imm32[12:19] to the correct VGPR + // MSBs. + if (Size <= VGPRMSBShift) { + // This instruction now acts as MostRecentModeSet so it can be updated if + // CurrentMode changes via piggybacking. + MostRecentModeSet = &MI; + return updateSetregModeImm(MI, ModeValue); + } + + // Case 2: Size > 12 - the original instruction uses bits beyond 11, so we + // cannot arbitrarily modify imm32[12:19]. Check if it already matches VGPR + // MSBs. Note: imm32[12:19] is in MODE register format, while ModeValue is + // in S_SET_VGPR_MSB format, so we need to convert before comparing. 
+ MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm); + assert(ImmOp && "ImmOp must be present"); + int64_t ImmBits12To19 = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift; + int64_t SetregModeValue = convertModeToSetregFormat(ModeValue); + if (ImmBits12To19 == SetregModeValue) { + // Already correct, but we must invalidate MostRecentModeSet because this + // instruction will overwrite mode[12:19]. We can't update this instruction + // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes, + // a new s_set_vgpr_msb will be inserted after this instruction. + MostRecentModeSet = nullptr; + return false; + } + + // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after + // the original instruction to restore the correct value. + MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator()); + MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(), + TII->get(AMDGPU::S_SET_VGPR_MSB)) + .addImm(ModeValue); + return true; +} + bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (!ST.has1024AddressableVGPRs()) @@ -282,11 +425,10 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { bool Changed = false; ClauseLen = ClauseRemaining = 0; - CurrentMode.reset(); - CurrentMask.reset(); - CurrentModeKnown = true; + CurrentMode = {}; for (auto &MBB : MF) { MostRecentModeSet = nullptr; + this->MBB = &MBB; for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) { if (MI.isMetaInstruction()) @@ -294,17 +436,16 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { if (MI.isTerminator() || MI.isCall()) { if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - CurrentMode.reset(); - CurrentModeKnown = true; - } else - resetMode(&MI); + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) + CurrentMode = {}; + else + resetMode(MI.getIterator()); continue; } if (MI.isInlineAsm()) { if (TII->hasVGPRUses(MI)) - 
resetMode(&MI); + resetMode(MI.getIterator()); continue; } @@ -317,20 +458,20 @@ bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) { continue; } + if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 && + ST.hasSetregVGPRMSBFixup()) { + Changed |= handleSetregMode(MI); + continue; + } + Changed |= runOnMachineInstr(MI); if (ClauseRemaining) --ClauseRemaining; } - // If we're falling through to a block that has at least one other - // predecessor, we no longer know the mode. - MachineBasicBlock *Next = MBB.getNextNode(); - if (Next && Next->pred_size() >= 2 && - llvm::is_contained(Next->predecessors(), &MBB)) { - if (CurrentMode.raw_bits().any()) - CurrentModeKnown = false; - } + // Reset the mode if we are falling through. + resetMode(MBB.instr_end()); } return Changed; @@ -367,7 +508,5 @@ AMDGPULowerVGPREncodingPass::run(MachineFunction &MF, if (!AMDGPULowerVGPREncoding().run(MF)) return PreservedAnalyses::all(); - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; + return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 680e7eb..fc408aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(Src); return; } else if (Opcode == AMDGPU::SI_TCRETURN || - Opcode == AMDGPU::SI_TCRETURN_GFX) { + Opcode == AMDGPU::SI_TCRETURN_GFX || + Opcode == AMDGPU::SI_TCRETURN_CHAIN) { // TODO: How to use branch immediate and avoid register+add? 
Opcode = AMDGPU::S_SETPC_B64; } else if (AMDGPU::getT16D16Helper(Opcode)) { @@ -243,7 +244,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { int MCOpcode = TII->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " "a target-specific version: " + Twine(MI->getOpcode())); } @@ -332,7 +333,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { StringRef Err; if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } @@ -346,7 +347,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { } } else { // We don't want these pseudo instructions encoded. They are - // placeholder terminator instructions and should only be printed as + // placeholder instructions and should only be printed as // comments. 
if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { if (isVerbose()) @@ -360,6 +361,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::ASYNCMARK) { + if (isVerbose()) + OutStreamer->emitRawComment(" asyncmark"); + return; + } + + if (MI->getOpcode() == AMDGPU::WAIT_ASYNCMARK) { + if (isVerbose()) { + OutStreamer->emitRawComment(" wait_asyncmark(" + + Twine(MI->getOperand(0).getImm()) + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { if (isVerbose()) { std::string HexString; @@ -405,6 +420,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + unsigned Opc = MI->getOpcode(); + if (LLVM_UNLIKELY(Opc == TargetOpcode::STATEPOINT || + Opc == TargetOpcode::STACKMAP || + Opc == TargetOpcode::PATCHPOINT)) { + LLVMContext &Ctx = MI->getMF()->getFunction().getContext(); + Ctx.emitError("unhandled statepoint-like instruction"); + OutStreamer->emitRawComment("unsupported statepoint/stackmap/patchpoint"); + return; + } + if (isVerbose()) if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode())) emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(), @@ -412,7 +437,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { *OutStreamer); if (isVerbose() && MI->getOpcode() == AMDGPU::S_SET_VGPR_MSB) { - unsigned V = MI->getOperand(0).getImm(); + unsigned V = MI->getOperand(0).getImm() & 0xff; OutStreamer->AddComment( " msbs: dst=" + Twine(V >> 6) + " src0=" + Twine(V & 3) + " src1=" + Twine((V >> 2) & 3) + " src2=" + Twine((V >> 4) & 3)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp index 75e3d8c..a541a26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp @@ -13,13 +13,61 @@ #include "AMDGPUMIRFormatter.h" #include "SIMachineFunctionInfo.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; +const char 
SWaitAluImmPrefix = '.'; +StringLiteral SWaitAluDelim = "_"; + +StringLiteral VaVdstName = "VaVdst"; +StringLiteral VaSdstName = "VaSdst"; +StringLiteral VaSsrcName = "VaSsrc"; +StringLiteral HoldCntName = "HoldCnt"; +StringLiteral VmVsrcName = "VmVsrc"; +StringLiteral VaVccName = "VaVcc"; +StringLiteral SaSdstName = "SaSdst"; + +StringLiteral AllOff = "AllOff"; + +void AMDGPUMIRFormatter::printSWaitAluImm(uint64_t Imm, raw_ostream &OS) const { + bool NonePrinted = true; + ListSeparator Delim(SWaitAluDelim); + auto PrintFieldIfNotMax = [&](StringRef Descr, uint64_t Num, unsigned Max) { + if (Num != Max) { + OS << Delim << Descr << SWaitAluDelim << Num; + NonePrinted = false; + } + }; + OS << SWaitAluImmPrefix; + PrintFieldIfNotMax(VaVdstName, AMDGPU::DepCtr::decodeFieldVaVdst(Imm), + AMDGPU::DepCtr::getVaVdstBitMask()); + PrintFieldIfNotMax(VaSdstName, AMDGPU::DepCtr::decodeFieldVaSdst(Imm), + AMDGPU::DepCtr::getVaSdstBitMask()); + PrintFieldIfNotMax(VaSsrcName, AMDGPU::DepCtr::decodeFieldVaSsrc(Imm), + AMDGPU::DepCtr::getVaSsrcBitMask()); + PrintFieldIfNotMax( + HoldCntName, + AMDGPU::DepCtr::decodeFieldHoldCnt(Imm, + AMDGPU::getIsaVersion(STI.getCPU())), + AMDGPU::DepCtr::getHoldCntBitMask(AMDGPU::getIsaVersion(STI.getCPU()))); + PrintFieldIfNotMax(VmVsrcName, AMDGPU::DepCtr::decodeFieldVmVsrc(Imm), + AMDGPU::DepCtr::getVmVsrcBitMask()); + PrintFieldIfNotMax(VaVccName, AMDGPU::DepCtr::decodeFieldVaVcc(Imm), + AMDGPU::DepCtr::getVaVccBitMask()); + PrintFieldIfNotMax(SaSdstName, AMDGPU::DepCtr::decodeFieldSaSdst(Imm), + AMDGPU::DepCtr::getSaSdstBitMask()); + if (NonePrinted) + OS << AllOff; +} + void AMDGPUMIRFormatter::printImm(raw_ostream &OS, const MachineInstr &MI, std::optional<unsigned int> OpIdx, int64_t Imm) const { switch (MI.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + printSWaitAluImm(Imm, OS); + break; case AMDGPU::S_DELAY_ALU: assert(OpIdx == 0); printSDelayAluImm(Imm, OS); @@ -39,6 +87,8 @@ bool AMDGPUMIRFormatter::parseImmMnemonic(const unsigned 
OpCode, { switch (OpCode) { + case AMDGPU::S_WAITCNT_DEPCTR: + return parseSWaitAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback); case AMDGPU::S_DELAY_ALU: return parseSDelayAluImmMnemonic(OpIdx, Imm, Src, ErrorCallback); default: @@ -90,6 +140,89 @@ void AMDGPUMIRFormatter::printSDelayAluImm(int64_t Imm, Outdep(Id1); } +bool AMDGPUMIRFormatter::parseSWaitAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, StringRef &Src, + MIRFormatter::ErrorCallbackType &ErrorCallback) const { + // TODO: For now accept integer masks for compatibility with old MIR. + if (!Src.consumeInteger(10, Imm)) + return false; + + // Initialize with all checks off. + Imm = AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI); + // The input is in the form: .Name1_Num1_Name2_Num2 + // Drop the '.' prefix. + bool ConsumePrefix = Src.consume_front(SWaitAluImmPrefix); + if (!ConsumePrefix) + return ErrorCallback(Src.begin(), "expected prefix"); + if (Src.empty()) + return ErrorCallback(Src.begin(), "expected <CounterName>_<CounterNum>"); + + // Special case for all off. + if (Src == AllOff) + return false; + + // Parse a counter name, number pair in each iteration. + while (!Src.empty()) { + // Src: Name1_Num1_Name2_Num2 + // ^ + size_t DelimIdx = Src.find(SWaitAluDelim); + if (DelimIdx == StringRef::npos) + return ErrorCallback(Src.begin(), "expected <CounterName>_<CounterNum>"); + // Src: Name1_Num1_Name2_Num2 + // ^^^^^ + StringRef Name = Src.substr(0, DelimIdx); + // Save the position of the name for accurate error reporting. + StringRef::iterator NamePos = Src.begin(); + [[maybe_unused]] bool ConsumeName = Src.consume_front(Name); + assert(ConsumeName && "Expected name"); + [[maybe_unused]] bool ConsumeDelim = Src.consume_front(SWaitAluDelim); + assert(ConsumeDelim && "Expected delimiter"); + // Src: Num1_Name2_Num2 + // ^ + DelimIdx = Src.find(SWaitAluDelim); + // Src: Num1_Name2_Num2 + // ^^^^ + int64_t Num; + // Save the position of the number for accurate error reporting. 
+ StringRef::iterator NumPos = Src.begin(); + if (Src.consumeInteger(10, Num) || Num < 0) + return ErrorCallback(NumPos, + "expected non-negative integer counter number"); + unsigned Max; + if (Name == VaVdstName) { + Max = AMDGPU::DepCtr::getVaVdstBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaVdst(Imm, Num); + } else if (Name == VmVsrcName) { + Max = AMDGPU::DepCtr::getVmVsrcBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVmVsrc(Imm, Num); + } else if (Name == VaSdstName) { + Max = AMDGPU::DepCtr::getVaSdstBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaSdst(Imm, Num); + } else if (Name == VaSsrcName) { + Max = AMDGPU::DepCtr::getVaSsrcBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaSsrc(Imm, Num); + } else if (Name == HoldCntName) { + const AMDGPU::IsaVersion &Version = AMDGPU::getIsaVersion(STI.getCPU()); + Max = AMDGPU::DepCtr::getHoldCntBitMask(Version); + Imm = AMDGPU::DepCtr::encodeFieldHoldCnt(Imm, Num, Version); + } else if (Name == VaVccName) { + Max = AMDGPU::DepCtr::getVaVccBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldVaVcc(Imm, Num); + } else if (Name == SaSdstName) { + Max = AMDGPU::DepCtr::getSaSdstBitMask(); + Imm = AMDGPU::DepCtr::encodeFieldSaSdst(Imm, Num); + } else { + return ErrorCallback(NamePos, "invalid counter name"); + } + // Don't allow the values to reach their maximum value. 
+ if (Num >= Max) + return ErrorCallback(NumPos, "counter value too large"); + // Src: Name2_Num2 + Src.consume_front(SWaitAluDelim); + } + return false; +} + bool AMDGPUMIRFormatter::parseSDelayAluImmMnemonic( const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, llvm::MIRFormatter::ErrorCallbackType &ErrorCallback) const diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index c5c9473..dbfc645 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -16,6 +16,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H #define LLVM_LIB_TARGET_AMDGPUMIRFORMATTER_H +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/MIRFormatter.h" namespace llvm { @@ -25,21 +26,20 @@ struct PerFunctionMIParsingState; class AMDGPUMIRFormatter final : public MIRFormatter { public: - AMDGPUMIRFormatter() = default; - virtual ~AMDGPUMIRFormatter() = default; + explicit AMDGPUMIRFormatter(const MCSubtargetInfo &STI) : STI(STI) {} + ~AMDGPUMIRFormatter() override = default; /// Implement target specific printing for machine operand immediate value, so /// that we can have more meaningful mnemonic than a 64-bit integer. Passing /// None to OpIdx means the index is unknown. - virtual void printImm(raw_ostream &OS, const MachineInstr &MI, - std::optional<unsigned> OpIdx, - int64_t Imm) const override; + void printImm(raw_ostream &OS, const MachineInstr &MI, + std::optional<unsigned> OpIdx, int64_t Imm) const override; /// Implement target specific parsing of immediate mnemonics. The mnemonic is /// a string with a leading dot. 
- virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - StringRef Src, int64_t &Imm, - ErrorCallbackType ErrorCallback) const override; + bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const override; /// Implement target specific parsing of target custom pseudo source value. bool @@ -49,9 +49,17 @@ public: ErrorCallbackType ErrorCallback) const override; private: + const MCSubtargetInfo &STI; + /// Prints the string to represent s_wait_alu immediate value. + void printSWaitAluImm(uint64_t Imm, raw_ostream &OS) const; /// Print the string to represent s_delay_alu immediate value void printSDelayAluImm(int64_t Imm, llvm::raw_ostream &OS) const; + /// Parse the immediate pseudo literal for s_wait_alu + bool parseSWaitAluImmMnemonic( + const unsigned int OpIdx, int64_t &Imm, StringRef &Src, + MIRFormatter::ErrorCallbackType &ErrorCallback) const; + /// Parse the immediate pseudo literal for s_delay_alu bool parseSDelayAluImmMnemonic( const unsigned int OpIdx, int64_t &Imm, llvm::StringRef &Src, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 664a15c..1730757 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -80,11 +80,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); - // FIXME: Shouldn't be target specific - Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math"); - NoSignedZerosFPMath = - NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true"; - const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F); if (DynLdsGlobal || hasLDSKernelArgument(F)) UsesDynamicLDS = true; @@ -107,7 +102,7 @@ unsigned 
AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, if (!BarAddr) llvm_unreachable("named barrier should have an assigned address"); Entry.first->second = BarAddr.value(); - unsigned BarCnt = DL.getTypeAllocSize(GV.getValueType()) / 16; + unsigned BarCnt = GV.getGlobalSize(DL) / 16; recordNumNamedBarriers(BarAddr.value(), BarCnt); return BarAddr.value(); } @@ -135,8 +130,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, // section, and not within some other non-absolute-address object // allocated here, but the extra error detection is minimal and we would // have to pass the Function around or cache the attribute value. - uint32_t ObjectEnd = - ObjectStart + DL.getTypeAllocSize(GV.getValueType()); + uint32_t ObjectEnd = ObjectStart + GV.getGlobalSize(DL); if (ObjectEnd > StaticLDSSize) { report_fatal_error( "Absolute address LDS variable outside of static frame"); @@ -152,7 +146,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, /// during lowering. Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); - StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticLDSSize += GV.getGlobalSize(DL); // Align LDS size to trailing, e.g. 
for aligning dynamic shared memory LDSSize = alignTo(StaticLDSSize, Trailing); @@ -161,7 +155,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, "expected region address space"); Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment); - StaticGDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticGDSSize += GV.getGlobalSize(DL); // FIXME: Apply alignment of dynamic GDS GDSSize = StaticGDSSize; @@ -210,7 +204,7 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, const GlobalVariable &GV) { const Module *M = F.getParent(); const DataLayout &DL = M->getDataLayout(); - assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); + assert(GV.getGlobalSize(DL) == 0); Align Alignment = DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index fc64e16..1317210 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -61,8 +61,6 @@ protected: // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve CC. bool IsChainFunction = false; - bool NoSignedZerosFPMath = false; - // Function may be memory bound. 
bool MemoryBound = false; @@ -107,10 +105,6 @@ public: return isEntryFunction() || isChainFunction(); } - bool hasNoSignedZerosFPMath() const { - return NoSignedZerosFPMath; - } - bool isMemoryBound() const { return MemoryBound; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c211..9fbb19d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -70,7 +70,7 @@ bool isDynamicLDS(const GlobalVariable &GV) { const DataLayout &DL = M->getDataLayout(); if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) return false; - return DL.getTypeAllocSize(GV.getValueType()) == 0; + return GV.getGlobalSize(DL) == 0; } bool isLDSVariableToLower(const GlobalVariable &GV) { @@ -126,7 +126,7 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, for (User *V : GV.users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); - if (isKernelLDS(F)) + if (isKernel(*F)) kernels[F].insert(&GV); else Functions[F].insert(&GV); @@ -135,10 +135,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, } } -bool isKernelLDS(const Function *F) { - return AMDGPU::isKernel(F->getCallingConv()); -} - LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap DirectMapKernel; @@ -148,7 +144,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // Collect functions whose address has escaped DenseSet<Function *> AddressTakenFuncs; for (Function &F : M.functions()) { - if (!isKernelLDS(&F)) + if (!isKernel(F)) if (F.hasAddressTaken(nullptr, /* IgnoreCallbackUses */ false, /* IgnoreAssumeLikeCalls */ false, @@ -180,7 +176,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // access all variables accessed by functions whose address escaped for (Function &F : M.functions()) { if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) { - if 
(!isKernelLDS(&F)) { + if (!isKernel(F)) { set_union(TransitiveMapFunction[&F], VariablesReachableThroughFunctionPointer); } @@ -190,7 +186,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // Direct implementation of collecting all variables reachable from each // function for (Function &Func : M.functions()) { - if (Func.isDeclaration() || isKernelLDS(&Func)) + if (Func.isDeclaration() || isKernel(Func)) continue; DenseSet<Function *> seen; // catches cycles @@ -227,7 +223,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { FunctionVariableMap IndirectMapKernel; for (Function &Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) + if (Func.isDeclaration() || !isKernel(Func)) continue; for (const CallGraphNode::CallRecord &R : *CG[&Func]) { @@ -273,6 +269,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. 
std::optional<bool> HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +282,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } @@ -335,7 +337,7 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, Function *PotentialCallee = ExternalCallRecord.second->getFunction(); assert(PotentialCallee); - if (!isKernelLDS(PotentialCallee)) { + if (!isKernel(*PotentialCallee)) { for (StringRef Attr : FnAttrs) PotentialCallee->removeFnAttr(Attr); } @@ -369,6 +371,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_wait: case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: + case Intrinsic::amdgcn_s_wakeup_barrier: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: case Intrinsic::amdgcn_sched_group_barrier: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h index 058e744..8868b93 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h @@ -53,8 +53,6 @@ void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, FunctionVariableMap &kernels, FunctionVariableMap &functions); -bool isKernelLDS(const Function *F); - LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M); /// Strip FnAttr attribute from any functions where we may have diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..f464fbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -13,6 +13,12 @@ // NOTE: NO INCLUDE GUARD DESIRED! 
+#ifndef MODULE_ANALYSIS +#define MODULE_ANALYSIS(NAME, CREATE_PASS) +#endif +MODULE_ANALYSIS("amdgpu-argument-usage", AMDGPUArgumentUsageAnalysis()) +#undef MODULE_ANALYSIS + #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif @@ -29,8 +35,8 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) +MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +75,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index cf2ab825..a3be0f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -48,7 +48,7 @@ private: FuncInfoMap FIM; public: - AMDGPUPerfHintAnalysis() {} + AMDGPUPerfHintAnalysis() = default; // OldPM bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index e86b473..0264d88 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -367,10 +367,10 @@ bool 
AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
   return TLI->isCanonicalized(Reg, MF);
 }
 
-// The buffer_load_{i8, i16} intrinsics are intially lowered as buffer_load_{u8,
-// u16} instructions. Here, the buffer_load_{u8, u16} instructions are combined
-// with sign extension instrucions in order to generate buffer_load_{i8, i16}
-// instructions.
+// The buffer_load_{i8, i16} intrinsics are initially lowered as
+// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
+// instructions are combined with sign extension instructions in order to
+// generate buffer_load_{i8, i16} instructions.
 
 // Identify buffer_load_{u8, u16}.
 bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 6e54737..4a70c5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -177,8 +177,7 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
   Register Src = MatchInfo.Origin;
 
-  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
-         LLT::scalar(64));
+  assert(MI.getMF()->getRegInfo().getType(Src) == LLT::scalar(64));
 
   const LLT S32 = LLT::scalar(32);
   auto Unmerge = B.buildUnmerge(S32, Src);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index ffbbf63..7d6e3ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -127,7 +127,7 @@ private:
   // will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) { FunctionType *FT = F.getFunctionType(); - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end()); for (unsigned I = 0; I <= LastPreloadIndex; ++I) FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I))); @@ -196,7 +196,7 @@ public: SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads; for (auto *U : ImplicitArgPtr->users()) { Instruction *CI = dyn_cast<Instruction>(U); - if (!CI || CI->getParent()->getParent() != &F) + if (!CI || CI->getFunction() != &F) continue; for (auto *U : CI->users()) { @@ -213,7 +213,7 @@ public: continue; // FIXME: Expand handle merged loads. - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); Type *LoadTy = Load->getType(); HiddenArg HA = getHiddenArgFromOffset(Offset); if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp index 0137b3f..a43600a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -46,10 +46,7 @@ class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { public: static char ID; - AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { - initializeAMDGPUPrepareAGPRAllocLegacyPass( - *PassRegistry::getPassRegistry()); - } + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -62,10 +59,8 @@ public: }; } // End anonymous namespace. 
-INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
-                      "AMDGPU Prepare AGPR Alloc", false, false)
-INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
-                    "AMDGPU Prepare AGPR Alloc", false, false)
+INITIALIZE_PASS(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE,
+                "AMDGPU Prepare AGPR Alloc", false, false)
 
 char AMDGPUPrepareAGPRAllocLegacy::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index f5e14c7..d3fa423 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -129,7 +129,7 @@ static StringRef getAsConstantStr(Value *V) {
 
 static void diagnoseInvalidFormatString(const CallBase *CI) {
   CI->getContext().diagnose(DiagnosticInfoUnsupported(
-      *CI->getParent()->getParent(),
+      *CI->getFunction(),
       "printf format string must be a trivially resolved constant string "
       "global variable",
       CI->getDebugLoc()));
@@ -416,9 +416,13 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
     }
   }
 
-  // erase the printf calls
-  for (auto *CI : Printfs)
+  // Erase the printf calls and replace all uses with 0, signaling success.
+  // Since OpenCL only specifies undefined behaviors and not success criteria,
+  // returning 0 to signal success is always valid.
+  for (auto *CI : Printfs) {
+    CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0));
     CI->eraseFromParent();
+  }
   Printfs.clear();
 
   return true;
@@ -434,6 +438,17 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
       M.getModuleFlag("openmp"))
     return false;
 
+  // Verify the signature of the printf function and skip if it isn't correct.
+ const FunctionType *PrintfFunctionTy = PrintfFunction->getFunctionType(); + if (PrintfFunctionTy->getNumParams() != 1 || !PrintfFunctionTy->isVarArg() || + !PrintfFunctionTy->getReturnType()->isIntegerTy(32)) + return false; + Type *PrintfFormatArgTy = PrintfFunctionTy->getParamType(0); + if (!PrintfFormatArgTy->isPointerTy() || + !AMDGPU::isFlatGlobalAddrSpace( + PrintfFormatArgTy->getPointerAddressSpace())) + return false; + for (auto &U : PrintfFunction->uses()) { if (auto *CI = dyn_cast<CallInst>(U.getUser())) { if (CI->isCallee(&U) && !CI->isNoBuiltin()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ddabd25..ed676c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -85,6 +86,42 @@ static cl::opt<unsigned> "when sorting profitable allocas"), cl::init(4)); +// We support vector indices of the form (A * stride) + B +// All parts are optional. +struct GEPToVectorIndex { + Value *VarIndex = nullptr; // defaults to 0 + ConstantInt *VarMul = nullptr; // defaults to 1 + ConstantInt *ConstIndex = nullptr; // defaults to 0 + Value *Full = nullptr; +}; + +struct MemTransferInfo { + ConstantInt *SrcIndex = nullptr; + ConstantInt *DestIndex = nullptr; +}; + +// Analysis for planning the different strategies of alloca promotion. 
+struct AllocaAnalysis { + AllocaInst *Alloca = nullptr; + DenseSet<Value *> Pointers; + SmallVector<Use *> Uses; + unsigned Score = 0; + bool HaveSelectOrPHI = false; + struct { + FixedVectorType *Ty = nullptr; + SmallVector<Instruction *> Worklist; + SmallVector<Instruction *> UsersToRemove; + MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx; + MapVector<MemTransferInst *, MemTransferInfo> TransferInfo; + } Vector; + struct { + bool Enable = false; + SmallVector<User *> Worklist; + } LDS; + + explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {} +}; + // Shared implementation which can do both promotion to vector and to LDS. class AMDGPUPromoteAllocaImpl { private: @@ -106,10 +143,7 @@ private: std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); - /// BaseAlloca is the alloca root the search started from. - /// Val may be that alloca or a recursive user of it. - bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, - std::vector<Value *> &WorkList) const; + bool collectAllocaUses(AllocaAnalysis &AA) const; /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). @@ -122,10 +156,16 @@ private: /// Check whether we have enough local memory for promotion. 
bool hasSufficientLocalMem(const Function &F); - bool tryPromoteAllocaToVector(AllocaInst &I); - bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); + FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const; + void analyzePromoteToVector(AllocaAnalysis &AA) const; + void promoteAllocaToVector(AllocaAnalysis &AA); + void analyzePromoteToLDS(AllocaAnalysis &AA) const; + bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS, + SetVector<IntrinsicInst *> &DeferredIntrs); + void + finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs); - void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas); + void scoreAlloca(AllocaAnalysis &AA) const; void setFunctionLimits(const Function &F); @@ -236,53 +276,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() { return new AMDGPUPromoteAlloca(); } -static void collectAllocaUses(AllocaInst &Alloca, - SmallVectorImpl<Use *> &Uses) { - SmallVector<Instruction *, 4> WorkList({&Alloca}); +bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const { + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n" + << " " << *Inst << "\n"); + return false; + }; + + SmallVector<Instruction *, 4> WorkList({AA.Alloca}); while (!WorkList.empty()) { auto *Cur = WorkList.pop_back_val(); + if (find(AA.Pointers, Cur) != AA.Pointers.end()) + continue; + AA.Pointers.insert(Cur); for (auto &U : Cur->uses()) { - Uses.push_back(&U); + auto *Inst = cast<Instruction>(U.getUser()); + if (isa<StoreInst>(Inst)) { + if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) { + return RejectUser(Inst, "pointer escapes via store"); + } + } + AA.Uses.push_back(&U); + + if (isa<GetElementPtrInst>(U.getUser())) { + WorkList.push_back(Inst); + } else if (auto *SI = dyn_cast<SelectInst>(Inst)) { + // Only promote a select if we know that the other select operand is + // from another pointer that will also be promoted. 
+ if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2)) + return RejectUser(Inst, "select from mixed objects"); + WorkList.push_back(Inst); + AA.HaveSelectOrPHI = true; + } else if (auto *Phi = dyn_cast<PHINode>(Inst)) { + // Repeat for phis. + + // TODO: Handle more complex cases. We should be able to replace loops + // over arrays. + switch (Phi->getNumIncomingValues()) { + case 1: + break; + case 2: + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1)) + return RejectUser(Inst, "phi from mixed objects"); + break; + default: + return RejectUser(Inst, "phi with too many operands"); + } - if (isa<GetElementPtrInst>(U.getUser())) - WorkList.push_back(cast<Instruction>(U.getUser())); + WorkList.push_back(Inst); + AA.HaveSelectOrPHI = true; + } } } + return true; } -void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( - SmallVectorImpl<AllocaInst *> &Allocas) { - DenseMap<AllocaInst *, unsigned> Scores; - - for (auto *Alloca : Allocas) { - LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n"); - unsigned &Score = Scores[Alloca]; - // Increment score by one for each user + a bonus for users within loops. - SmallVector<Use *, 8> Uses; - collectAllocaUses(*Alloca, Uses); - for (auto *U : Uses) { - Instruction *Inst = cast<Instruction>(U->getUser()); - if (isa<GetElementPtrInst>(Inst)) - continue; - unsigned UserScore = - 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); - LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); - Score += UserScore; - } - LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); +void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const { + LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n"); + unsigned Score = 0; + // Increment score by one for each user + a bonus for users within loops. 
+ for (auto *U : AA.Uses) { + Instruction *Inst = cast<Instruction>(U->getUser()); + if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || + isa<PHINode>(Inst)) + continue; + unsigned UserScore = + 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); + LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); + Score += UserScore; } - - stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) { - return Scores.at(A) > Scores.at(B); - }); - - // clang-format off - LLVM_DEBUG( - dbgs() << "Sorted Worklist:\n"; - for (auto *A: Allocas) - dbgs() << " " << *A << "\n"; - ); - // clang-format on + LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); + AA.Score = Score; } void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { @@ -307,7 +371,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { DL = &Mod->getDataLayout(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); - if (!ST.isPromoteAllocaEnabled()) + if (!ST.enablePromoteAlloca()) return false; bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F); @@ -319,27 +383,49 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { : (MaxVGPRs * 32)) / VGPRBudgetRatio; - SmallVector<AllocaInst *, 16> Allocas; + std::vector<AllocaAnalysis> Allocas; for (Instruction &I : F.getEntryBlock()) { if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { // Array allocations are probably not worth handling, since an allocation // of the array type is the canonical form. 
if (!AI->isStaticAlloca() || AI->isArrayAllocation()) continue; - Allocas.push_back(AI); + + LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n'); + + AllocaAnalysis AA{AI}; + if (collectAllocaUses(AA)) { + analyzePromoteToVector(AA); + if (PromoteToLDS) + analyzePromoteToLDS(AA); + if (AA.Vector.Ty || AA.LDS.Enable) { + scoreAlloca(AA); + Allocas.push_back(std::move(AA)); + } + } } } - sortAllocasToPromote(Allocas); + stable_sort(Allocas, + [](const auto &A, const auto &B) { return A.Score > B.Score; }); + + // clang-format off + LLVM_DEBUG( + dbgs() << "Sorted Worklist:\n"; + for (const auto &AA : Allocas) + dbgs() << " " << *AA.Alloca << "\n"; + ); + // clang-format on bool Changed = false; - for (AllocaInst *AI : Allocas) { - const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); - // First, check if we have enough budget to vectorize this alloca. - if (AllocaCost <= VectorizationBudget) { - // If we do, attempt vectorization, otherwise, fall through and try - // promoting to LDS instead. - if (tryPromoteAllocaToVector(*AI)) { + SetVector<IntrinsicInst *> DeferredIntrs; + for (AllocaAnalysis &AA : Allocas) { + if (AA.Vector.Ty) { + const unsigned AllocaCost = + DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()); + // First, check if we have enough budget to vectorize this alloca. 
+ if (AllocaCost <= VectorizationBudget) { + promoteAllocaToVector(AA); Changed = true; assert((VectorizationBudget - AllocaCost) < VectorizationBudget && "Underflow!"); @@ -347,16 +433,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" << VectorizationBudget << "\n"); continue; + } else { + LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" + << AllocaCost << ", budget:" << VectorizationBudget + << "): " << *AA.Alloca << "\n"); } - } else { - LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" - << AllocaCost << ", budget:" << VectorizationBudget - << "): " << *AI << "\n"); } - if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS)) + if (AA.LDS.Enable && + tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs)) Changed = true; } + finishDeferredAllocaToLDSPromotion(DeferredIntrs); // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains // dangling pointers. If we want to reuse it past this point, the loop above @@ -365,11 +453,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { return Changed; } -struct MemTransferInfo { - ConstantInt *SrcIndex = nullptr; - ConstantInt *DestIndex = nullptr; -}; - // Checks if the instruction I is a memset user of the alloca AI that we can // deal with. Currently, only non-volatile memsets that affect the whole alloca // are handled. 
@@ -387,23 +470,48 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); } -static Value *calculateVectorIndex( - Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) { - auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()); - if (!GEP) - return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext())); +static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) { + IRBuilder<> B(Ptr->getContext()); + + Ptr = Ptr->stripPointerCasts(); + if (Ptr == AA.Alloca) + return B.getInt32(0); + + auto *GEP = cast<GetElementPtrInst>(Ptr); + auto I = AA.Vector.GEPVectorIdx.find(GEP); + assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!"); + + if (!I->second.Full) { + Value *Result = nullptr; + B.SetInsertPoint(GEP); + + if (I->second.VarIndex) { + Result = I->second.VarIndex; + Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty()); - auto I = GEPIdx.find(GEP); - assert(I != GEPIdx.end() && "Must have entry for GEP!"); + if (I->second.VarMul) + Result = B.CreateMul(Result, I->second.VarMul); + } + + if (I->second.ConstIndex) { + if (Result) + Result = B.CreateAdd(Result, I->second.ConstIndex); + else + Result = I->second.ConstIndex; + } + + if (!Result) + Result = B.getInt32(0); + + I->second.Full = Result; + } - Value *IndexValue = I->second; - assert(IndexValue && "index value missing from GEP index map"); - return IndexValue; + return I->second.Full; } -static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, - Type *VecElemTy, const DataLayout &DL, - SmallVector<Instruction *> &NewInsts) { +static std::optional<GEPToVectorIndex> +computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, + Type *VecElemTy, const DataLayout &DL) { // TODO: Extracting a "multiple of X" from a GEP might be a useful generic // helper. 
LLVMContext &Ctx = GEP->getContext(); @@ -431,7 +539,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Value *CurPtr = GEP; while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) { if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) - return nullptr; + return {}; // Move to the next outer pointer. CurPtr = CurGEP->getPointerOperand(); @@ -441,126 +549,78 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) - return nullptr; + return {}; APInt IndexQuot; int64_t Rem; APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem); if (Rem != 0) - return nullptr; - if (VarOffsets.size() == 0) - return ConstantInt::get(Ctx, IndexQuot); + return {}; - IRBuilder<> Builder(GEP); + GEPToVectorIndex Result; + + if (!ConstOffset.isZero()) + Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); + + if (VarOffsets.empty()) + return Result; const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + return {}; - Value *Offset = VarOffset.first; - auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); + Result.VarIndex = VarOffset.first; + auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType()); if (!OffsetType) - return nullptr; + return {}; - if (!OffsetQuot.isOne()) { - ConstantInt *ConstMul = - ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth())); - Offset = Builder.CreateMul(Offset, ConstMul); - if (Instruction *NewInst = dyn_cast<Instruction>(Offset)) - NewInsts.push_back(NewInst); - } - if (ConstOffset.isZero()) - return Offset; - - ConstantInt *ConstIndex = - ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth())); - Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); - if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd)) - 
NewInsts.push_back(NewInst); - return IndexAdd; + if (!OffsetQuot.isOne()) + Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); + + return Result; } /// Promotes a single user of the alloca to a vector form. /// /// \param Inst Instruction to be promoted. /// \param DL Module Data Layout. -/// \param VectorTy Vectorized Type. +/// \param AA Alloca Analysis. /// \param VecStoreSize Size of \p VectorTy in bytes. /// \param ElementSize Size of \p VectorTy element type in bytes. -/// \param TransferInfo MemTransferInst info map. -/// \param GEPVectorIdx GEP -> VectorIdx cache. /// \param CurVal Current value of the vector (e.g. last stored value) /// \param[out] DeferredLoads \p Inst is added to this vector if it can't /// be promoted now. This happens when promoting requires \p /// CurVal, but \p CurVal is nullptr. /// \return the stored value if \p Inst would have written to the alloca, or /// nullptr otherwise. -static Value *promoteAllocaUserToVector( - Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, - unsigned VecStoreSize, unsigned ElementSize, - DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, - std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal, - SmallVectorImpl<LoadInst *> &DeferredLoads) { +static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, + AllocaAnalysis &AA, + unsigned VecStoreSize, + unsigned ElementSize, + function_ref<Value *()> GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); - const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { - if (CurVal) - return CurVal; - - // If the current value is not known, insert a dummy load and lower it on - // the second pass. 
- LoadInst *Dummy = - Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), - "promotealloca.dummyload"); - DeferredLoads.push_back(Dummy); - return Dummy; - }; - - const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, - Type *PtrTy) -> Value * { - assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); - const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy); - if (!PtrTy->isVectorTy()) - return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size)); - const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements(); - // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to - // first cast the ptr vector to <2 x i64>. - assert((Size % NumPtrElts == 0) && "Vector size not divisble"); - Type *EltTy = Builder.getIntNTy(Size / NumPtrElts); - return Builder.CreateBitOrPointerCast( - Val, FixedVectorType::get(EltTy, NumPtrElts)); - }; - - Type *VecEltTy = VectorTy->getElementType(); + Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { case Instruction::Load: { - // Loads can only be lowered if the value is known. - if (!CurVal) { - DeferredLoads.push_back(cast<LoadInst>(Inst)); - return nullptr; - } - - Value *Index = calculateVectorIndex( - cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx); + Value *CurVal = GetCurVal(); + Value *Index = + calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA); // We're loading the full vector. 
Type *AccessTy = Inst->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); if (Constant *CI = dyn_cast<Constant>(Index)) { if (CI->isZeroValue() && AccessSize == VecStoreSize) { - if (AccessTy->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, AccessTy); - else if (CurVal->getType()->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); - Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy); - Inst->replaceAllUsesWith(NewVal); + Inst->replaceAllUsesWith( + Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy)); return nullptr; } } @@ -572,6 +632,36 @@ static Value *promoteAllocaUserToVector( auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); + // If idx is dynamic, then sandwich load with bitcasts. + // ie. VectorTy SubVecTy AccessTy + // <64 x i8> -> <16 x i8> <8 x i16> + // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16> + // Extracting subvector with dynamic index has very large expansion in + // the amdgpu backend. Limit to pow2. 
+ FixedVectorType *VectorTy = AA.Vector.Ty; + TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u; + uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value(); + bool IsAlignedLoad = NumBits <= (LoadAlign * 8u); + unsigned TotalNumElts = VectorTy->getNumElements(); + bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0; + if (!isa<ConstantInt>(Index) && + llvm::isPowerOf2_32(SubVecTy->getNumElements()) && + IsProperlyDivisible && IsAlignedLoad) { + IntegerType *NewElemTy = Builder.getIntNTy(NumBits); + const unsigned NewNumElts = + DL.getTypeStoreSize(VectorTy) * 8u / NumBits; + const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements()); + FixedVectorType *BitCastTy = + FixedVectorType::get(NewElemTy, NewNumElts); + Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy); + Value *NewIdx = Builder.CreateLShr( + Index, ConstantInt::get(Index->getType(), LShrAmt)); + Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx); + Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy); + Inst->replaceAllUsesWith(BCOut); + return nullptr; + } + Value *SubVec = PoisonValue::get(SubVecTy); for (unsigned K = 0; K < NumLoadedElts; ++K) { Value *CurIdx = @@ -580,13 +670,8 @@ static Value *promoteAllocaUserToVector( SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K); } - if (AccessTy->isPtrOrPtrVectorTy()) - SubVec = CreateTempPtrIntCast(SubVec, AccessTy); - else if (SubVecTy->isPtrOrPtrVectorTy()) - SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); - - SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy); - Inst->replaceAllUsesWith(SubVec); + Inst->replaceAllUsesWith( + Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy)); return nullptr; } @@ -604,39 +689,27 @@ static Value *promoteAllocaUserToVector( // to know the current value. If this is a store of a single element, we // need to know the value. 
StoreInst *SI = cast<StoreInst>(Inst); - Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA); Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. Type *AccessTy = Val->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); - if (Constant *CI = dyn_cast<Constant>(Index)) { - if (CI->isZeroValue() && AccessSize == VecStoreSize) { - if (AccessTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AccessTy); - else if (VectorTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, VectorTy); - return Builder.CreateBitOrPointerCast(Val, VectorTy); - } - } + if (Constant *CI = dyn_cast<Constant>(Index)) + if (CI->isZeroValue() && AccessSize == VecStoreSize) + return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty); // Storing a subvector. if (isa<FixedVectorType>(AccessTy)) { assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - const unsigned NumVecElts = VectorTy->getNumElements(); + const unsigned NumVecElts = AA.Vector.Ty->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); - if (SubVecTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, SubVecTy); - else if (AccessTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AccessTy); - - Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - - Value *CurVec = GetOrLoadCurrentVectorValue(); + Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy); + Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { Value *CurIdx = @@ -649,22 +722,21 @@ static Value *promoteAllocaUserToVector( if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); - return 
Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, - Index); + return Builder.CreateInsertElement(GetCurVal(), Val, Index); } case Instruction::Call: { if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) { // For memcpy, we need to know curval. ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[MTI]; + MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI]; unsigned SrcBegin = TI->SrcIndex->getZExtValue(); unsigned DestBegin = TI->DestIndex->getZExtValue(); SmallVector<int> Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) { if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin < VectorTy->getNumElements() + Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements() ? SrcBegin++ : PoisonMaskElem); } else { @@ -672,7 +744,7 @@ static Value *promoteAllocaUserToVector( } } - return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + return Builder.CreateShuffleVector(GetCurVal(), Mask); } if (auto *MSI = dyn_cast<MemSetInst>(Inst)) { @@ -693,14 +765,14 @@ static Value *promoteAllocaUserToVector( Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } - return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt); } if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { Intr->replaceAllUsesWith( Builder.getIntN(Intr->getType()->getIntegerBitWidth(), - DL.getTypeAllocSize(VectorTy))); + DL.getTypeAllocSize(AA.Vector.Ty))); return nullptr; } } @@ -791,16 +863,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } -// FIXME: Should try to pick the most likely to be profitable allocas first. 
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { - LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); - +FixedVectorType * +AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { if (DisablePromoteAllocaToVector) { - LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n"); - return false; + LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n"); + return nullptr; } - Type *AllocaTy = Alloca.getAllocatedType(); auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy); if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) { uint64_t NumElems = 1; @@ -832,10 +901,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } } } - if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); - return false; + return nullptr; } const unsigned MaxElements = @@ -845,46 +913,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " " << *VectorTy << " has an unsupported number of elements\n"); - return false; + return nullptr; } - std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx; - SmallVector<Instruction *> WorkList; - SmallVector<Instruction *> UsersToRemove; - SmallVector<Instruction *> DeferredInsts; - SmallVector<Instruction *> NewGEPInsts; - DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; - - const auto RejectUser = [&](Instruction *Inst, Twine Msg) { - LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" - << " " << *Inst << "\n"); - for (auto *Inst : reverse(NewGEPInsts)) - Inst->eraseFromParent(); - return false; - }; - - SmallVector<Use *, 8> Uses; - collectAllocaUses(Alloca, Uses); - - LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - Type *VecEltTy = VectorTy->getElementType(); unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { 
LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " "does not match the type's size\n"); - return false; + return nullptr; } - unsigned ElementSize = ElementSizeInBits / 8; + + return VectorTy; +} + +void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const { + if (AA.HaveSelectOrPHI) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n"); + return; + } + + Type *AllocaTy = AA.Alloca->getAllocatedType(); + AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy); + if (!AA.Vector.Ty) + return; + + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" + << " " << *Inst << "\n"); + AA.Vector.Ty = nullptr; + }; + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); - for (auto *U : Uses) { + for (auto *U : AA.Uses) { Instruction *Inst = cast<Instruction>(U->getUser()); if (Value *Ptr = getLoadStorePointerOperand(Inst)) { - // This is a store of the pointer, not to the pointer. - if (isa<StoreInst>(Inst) && - U->getOperandNo() != StoreInst::getPointerOperandIndex()) - return RejectUser(Inst, "pointer is being stored"); + assert(!isa<StoreInst>(Inst) || + U->getOperandNo() == StoreInst::getPointerOperandIndex()); Type *AccessTy = getLoadStoreType(Inst); if (AccessTy->isAggregateType()) @@ -900,34 +968,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. 
- if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) { - WorkList.push_back(Inst); + if (Ptr == AA.Alloca && + DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) == + DL->getTypeStoreSize(AccessTy)) { + AA.Vector.Worklist.push_back(Inst); continue; } - if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. - Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - GEPVectorIdx[GEP] = Index; - UsersToRemove.push_back(Inst); + AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value()); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst); - MSI && isSupportedMemset(MSI, &Alloca, *DL)) { - WorkList.push_back(Inst); + MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) { + AA.Vector.Worklist.push_back(Inst); continue; } @@ -940,31 +1009,32 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); - if (TransferInfo.try_emplace(TransferInst).second) { - DeferredInsts.push_back(Inst); - WorkList.push_back(Inst); - } + auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * { + if (Ptr == AA.Alloca) + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); - auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); - if (Ptr != &Alloca && 
!GEPVectorIdx.count(GEP)) + GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second; + if (GEPI.VarIndex) return nullptr; - - return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx)); + if (GEPI.ConstIndex) + return GEPI.ConstIndex; + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); }; + MemTransferInfo *TI = + &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second; unsigned OpNum = U->getOperandNo(); - MemTransferInfo *TI = &TransferInfo[TransferInst]; if (OpNum == 0) { Value *Dest = TransferInst->getDest(); - ConstantInt *Index = getPointerIndexOfAlloca(Dest); + ConstantInt *Index = getConstIndexIntoAlloca(Dest); if (!Index) return RejectUser(Inst, "could not calculate constant dest index"); TI->DestIndex = Index; } else { assert(OpNum == 1); Value *Src = TransferInst->getSource(); - ConstantInt *Index = getPointerIndexOfAlloca(Src); + ConstantInt *Index = getConstIndexIntoAlloca(Src); if (!Index) return RejectUser(Inst, "could not calculate constant src index"); TI->SrcIndex = Index; @@ -974,7 +1044,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } } @@ -983,97 +1053,114 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) return RejectUser(Inst, "assume-like intrinsic cannot have any users"); - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast<Instruction>(U)); })) { - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } return RejectUser(Inst, "unhandled alloca user"); } - while (!DeferredInsts.empty()) { - 
Instruction *Inst = DeferredInsts.pop_back_val(); - MemTransferInst *TransferInst = cast<MemTransferInst>(Inst); - // TODO: Support the case if the pointers are from different alloca or - // from different address spaces. - MemTransferInfo &Info = TransferInfo[TransferInst]; - if (!Info.SrcIndex || !Info.DestIndex) - return RejectUser( - Inst, "mem transfer inst is missing constant src and/or dst index"); + // Follow-up check to ensure we've seen both sides of all transfer insts. + for (const auto &Entry : AA.Vector.TransferInfo) { + const MemTransferInfo &TI = Entry.second; + if (!TI.SrcIndex || !TI.DestIndex) + return RejectUser(Entry.first, + "mem transfer inst between different objects"); + AA.Vector.Worklist.push_back(Entry.first); } +} - LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " - << *VectorTy << '\n'); - const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); +void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) { + LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n'); + LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType() + << " -> " << *AA.Vector.Ty << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty); + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; // Alloca is uninitialized memory. Imitate that by making the first value // undef. SSAUpdater Updater; - Updater.Initialize(VectorTy, "promotealloca"); + Updater.Initialize(AA.Vector.Ty, "promotealloca"); - BasicBlock *EntryBB = Alloca.getParent(); + BasicBlock *EntryBB = AA.Alloca->getParent(); BasicBlock::iterator InitInsertPos = - skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator()); - // Alloca memory is undefined to begin, not poison. 
- Value *AllocaInitValue = - new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos); - AllocaInitValue->takeName(&Alloca); + skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator()); + IRBuilder<> Builder(&*InitInsertPos); + Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty)); + AllocaInitValue->takeName(AA.Alloca); - Updater.AddAvailableValue(EntryBB, AllocaInitValue); + Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue); - // First handle the initial worklist. - SmallVector<LoadInst *, 4> DeferredLoads; - forEachWorkListItem(WorkList, [&](Instruction *I) { + // First handle the initial worklist, in basic block order. + // + // Insert a placeholder whenever we need the vector value at the top of a + // basic block. + SmallVector<Instruction *> Placeholders; + forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) { BasicBlock *BB = I->getParent(); - // On the first pass, we only take values that are trivially known, i.e. - // where AddAvailableValue was already called in this block. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.FindValueForBlock(BB), DeferredLoads); + auto GetCurVal = [&]() -> Value * { + if (Value *CurVal = Updater.FindValueForBlock(BB)) + return CurVal; + + if (!Placeholders.empty() && Placeholders.back()->getParent() == BB) + return Placeholders.back(); + + // If the current value in the basic block is not yet known, insert a + // placeholder that we will replace later. + IRBuilder<> Builder(I); + auto *Placeholder = cast<Instruction>(Builder.CreateFreeze( + PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder")); + Placeholders.push_back(Placeholder); + return Placeholders.back(); + }; + + Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize, + ElementSize, GetCurVal); if (Result) Updater.AddAvailableValue(BB, Result); }); - // Then handle deferred loads. 
- forEachWorkListItem(DeferredLoads, [&](Instruction *I) { - SmallVector<LoadInst *, 0> NewDLs; - BasicBlock *BB = I->getParent(); - // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always - // get a value, inserting PHIs as needed. - Value *Result = promoteAllocaUserToVector( - I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, - Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); - if (Result) - Updater.AddAvailableValue(BB, Result); - assert(NewDLs.empty() && "No more deferred loads should be queued!"); - }); + // Now fixup the placeholders. + SmallVector<Value *> PlaceholderToNewVal(Placeholders.size()); + for (auto [Index, Placeholder] : enumerate(Placeholders)) { + Value *NewVal = Updater.GetValueInMiddleOfBlock(Placeholder->getParent()); + PlaceholderToNewVal[Index] = NewVal; + Placeholder->replaceAllUsesWith(NewVal); + } + // Note: we cannot merge this loop with the previous one because it is + // possible that the placeholder itself can be used in the SSAUpdater. The + // replaceAllUsesWith doesn't replace those uses. + for (auto [Index, Placeholder] : enumerate(Placeholders)) { + if (!Placeholder->use_empty()) + Placeholder->replaceAllUsesWith(PlaceholderToNewVal[Index]); + Placeholder->eraseFromParent(); + } - // Delete all instructions. On the first pass, new dummy loads may have been - // added so we need to collect them too. - DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); - InstsToDelete.insert_range(DeferredLoads); - for (Instruction *I : InstsToDelete) { + // Delete all instructions. + for (Instruction *I : AA.Vector.Worklist) { assert(I->use_empty()); I->eraseFromParent(); } // Delete all the users that are known to be removeable. - for (Instruction *I : reverse(UsersToRemove)) { + for (Instruction *I : reverse(AA.Vector.UsersToRemove)) { I->dropDroppableUses(); assert(I->use_empty()); I->eraseFromParent(); } // Alloca should now be dead too. 
- assert(Alloca.use_empty()); - Alloca.eraseFromParent(); - return true; + assert(AA.Alloca->use_empty()); + AA.Alloca->eraseFromParent(); } std::pair<Value *, Value *> @@ -1247,61 +1334,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( return true; } -bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( - Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { +void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const { + if (DisablePromoteAllocaToLDS) { + LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); + return; + } - for (User *User : Val->users()) { - if (is_contained(WorkList, User)) - continue; + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + const Function &ContainingFunction = *AA.Alloca->getFunction(); + CallingConv::ID CC = ContainingFunction.getCallingConv(); + + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + default: + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); + return; + } + + for (Use *Use : AA.Uses) { + auto *User = Use->getUser(); if (CallInst *CI = dyn_cast<CallInst>(User)) { if (!isCallPromotable(CI)) - return false; + return; - WorkList.push_back(User); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); continue; } Instruction *UseInst = cast<Instruction>(User); if (UseInst->getOpcode() == Instruction::PtrToInt) - return false; + return; if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) { if (LI->isVolatile()) - return false; + return; continue; } if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) { if (SI->isVolatile()) - return false; - - // Reject if the stored value is not the pointer operand. 
- if (SI->getPointerOperand() != Val) - return false; + return; continue; } if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) { if (RMW->isVolatile()) - return false; + return; continue; } if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) { if (CAS->isVolatile()) - return false; + return; continue; } // Only promote a select if we know that the other select operand // is from another pointer that will also be promoted. if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) - return false; + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1)) + return; // May need to rewrite constant operands. - WorkList.push_back(ICmp); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(ICmp); continue; } @@ -1309,28 +1413,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // Be conservative if an address could be computed outside the bounds of // the alloca. if (!GEP->isInBounds()) - return false; - } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) { - // Only promote a select if we know that the other select operand is from - // another pointer that will also be promoted. - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) - return false; - } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) { - // Repeat for phis. - - // TODO: Handle more complex cases. We should be able to replace loops - // over arrays. - switch (Phi->getNumIncomingValues()) { - case 1: - break; - case 2: - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) - return false; - break; - default: - return false; - } - } else if (!isa<ExtractElementInst>(User)) { + return; + } else if (!isa<ExtractElementInst, SelectInst, PHINode>(User)) { // Do not promote vector/aggregate type instructions. It is hard to track // their users. 
@@ -1338,15 +1422,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // // TODO: If we know the address is only observed through flat pointers, we // could still promote. - return false; + return; } - WorkList.push_back(User); - if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) - return false; + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); } - return true; + AA.LDS.Enable = true; } bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { @@ -1378,7 +1461,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { for (const User *U : Val->users()) { if (const Instruction *Use = dyn_cast<Instruction>(U)) { - if (Use->getParent()->getParent() == &F) + if (Use->getFunction() == &F) return true; } else { const Constant *C = cast<Constant>(U); @@ -1419,7 +1502,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { for (const GlobalVariable *GV : UsedLDS) { Align Alignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); - uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType()); + uint64_t AllocSize = GV->getGlobalSize(DL); // HIP uses an extern unsized array in local address space for dynamically // allocated shared memory. In that case, we have to disable the promotion. @@ -1477,44 +1560,24 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. 
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, - bool SufficientLDS) { - LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); - - if (DisablePromoteAllocaToLDS) { - LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); - return false; - } - - const DataLayout &DL = Mod->getDataLayout(); - IRBuilder<> Builder(&I); - - const Function &ContainingFunction = *I.getParent()->getParent(); - CallingConv::ID CC = ContainingFunction.getCallingConv(); - - // Don't promote the alloca to LDS for shader calling conventions as the work - // item ID intrinsics are not supported for these calling conventions. - // Furthermore not all LDS is available for some of the stages. - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - break; - default: - LLVM_DEBUG( - dbgs() - << " promote alloca to LDS not supported with calling convention.\n"); - return false; - } +bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS( + AllocaAnalysis &AA, bool SufficientLDS, + SetVector<IntrinsicInst *> &DeferredIntrs) { + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n'); // Not likely to have sufficient local memory for promotion. if (!SufficientLDS) return false; + const DataLayout &DL = Mod->getDataLayout(); + IRBuilder<> Builder(AA.Alloca); + + const Function &ContainingFunction = *AA.Alloca->getParent()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - Align Alignment = - DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); + Align Alignment = DL.getValueOrABITypeAlignment( + AA.Alloca->getAlign(), AA.Alloca->getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. 
@@ -1524,7 +1587,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = - WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType()); + WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType()); NewSize += AllocSize; if (NewSize > LocalMemLimit) { @@ -1535,24 +1598,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, CurrentLocalMemUsage = NewSize; - std::vector<Value *> WorkList; - - if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); - Function *F = I.getParent()->getParent(); + Function *F = AA.Alloca->getFunction(); - Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); + Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy), - Twine(F->getName()) + Twine('.') + I.getName(), nullptr, + Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlign()); + GV->setAlignment(AA.Alloca->getAlign()); Value *TCntY, *TCntZ; @@ -1571,15 +1627,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); - - SmallVector<IntrinsicInst *> DeferredIntrs; + AA.Alloca->mutateType(Offset->getType()); + AA.Alloca->replaceAllUsesWith(Offset); + AA.Alloca->eraseFromParent(); PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); - for (Value *V : WorkList) { + for (Value 
*V : AA.LDS.Worklist) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { @@ -1637,7 +1691,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, // These have 2 pointer operands. In case if second pointer also needs // to be replaced we defer processing of these intrinsics until all // other values are processed. - DeferredIntrs.push_back(Intr); + DeferredIntrs.insert(Intr); continue; case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); @@ -1685,7 +1739,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, } } + return true; +} + +void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion( + SetVector<IntrinsicInst *> &DeferredIntrs) { + for (IntrinsicInst *Intr : DeferredIntrs) { + IRBuilder<> Builder(Intr); Builder.SetInsertPoint(Intr); Intrinsic::ID ID = Intr->getIntrinsicID(); assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); @@ -1703,6 +1764,4 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Intr->eraseFromParent(); } - - return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index fd604e1..e2e84ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -333,7 +333,7 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI, Register Val = Src0->getOperand(0).getReg(); auto isOp3Zero = [&]() { - MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e187959..888717f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -119,8 +128,9 @@ public: bool isLaneMask(Register Reg); std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode); - std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src); - Register getReadAnyLaneSrc(Register Src); + Register tryMatchUnmergeDefs(SmallVectorImpl<Register> &DefRegs); + SmallVector<Register> tryMatchMergeReadAnyLane(GMergeLikeInstr *Merge); + SmallVector<Register> getReadAnyLaneSrcs(Register Src); void replaceRegWithOrBuildCopy(Register Dst, Register Src); bool tryEliminateReadAnyLane(MachineInstr &Copy); @@ -145,43 +155,74 @@ AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { return {MatchMI, MatchMI->getOperand(1).getReg()}; } -std::pair<GUnmerge *, int> -AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { - MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); - if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) - return {nullptr, -1}; - - Register RALSrc = ReadAnyLane->getOperand(1).getReg(); - if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI)) - return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; +// Check if 
all registers are from same unmerge and there is no shuffling. +// Returns the unmerge source if both conditions are met. +Register AMDGPURegBankLegalizeCombiner::tryMatchUnmergeDefs( + SmallVectorImpl<Register> &DefRegs) { + auto *UnMerge = getOpcodeDef<GUnmerge>(DefRegs[0], MRI); + if (!UnMerge || UnMerge->getNumDefs() != DefRegs.size()) + return {}; + for (unsigned I = 1; I < DefRegs.size(); ++I) { + if (UnMerge->getReg(I) != DefRegs[I]) + return {}; + } + return UnMerge->getSourceReg(); +} - return {nullptr, -1}; +// Check if all merge sources are readanylanes and return the readanylane +// sources if they are. +SmallVector<Register> AMDGPURegBankLegalizeCombiner::tryMatchMergeReadAnyLane( + GMergeLikeInstr *Merge) { + SmallVector<Register> ReadAnyLaneSrcs; + for (unsigned i = 0; i < Merge->getNumSources(); ++i) { + Register Src; + if (!mi_match(Merge->getSourceReg(i), MRI, + m_GAMDGPUReadAnyLane(m_Reg(Src)))) + return {}; + ReadAnyLaneSrcs.push_back(Src); + } + return ReadAnyLaneSrcs; } -Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { +SmallVector<Register> +AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrcs(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) - return RALSrc; - - // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc - // LoSgpr = G_AMDGPU_READANYLANE LoVgpr - // HiSgpr = G_AMDGPU_READANYLANE HiVgpr - // Src G_MERGE_VALUES LoSgpr, HiSgpr - auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI); - if (Merge) { - unsigned NumElts = Merge->getNumSources(); - auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); - if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) + return {RALSrc}; + + // RALSrc = G_ANYEXT S16Src + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // Src = G_TRUNC TruncSrc + if (mi_match(Src, MRI, + 
m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) { + return {RALSrc}; + } + + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return {RALSrc}; + } + + // Sgpr0 = G_AMDGPU_READANYLANE Vgpr0 + // Sgpr1 = G_AMDGPU_READANYLANE Vgpr1 + // ... + // Src = G_MERGE_LIKE Sgpr0, Sgpr1, ... + // Dst = COPY Src + if (auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI)) { + SmallVector<Register> ReadAnyLaneSrcs = tryMatchMergeReadAnyLane(Merge); + if (ReadAnyLaneSrcs.empty()) return {}; - // Check if all elements are from same unmerge and there is no shuffling. - for (unsigned i = 1; i < NumElts; ++i) { - auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); - if (UnmergeI != Unmerge || (unsigned)IdxI != i) - return {}; - } - return Unmerge->getSourceReg(); + // Vgpr0, Vgpr1, ... = G_UNMERGE_VALUES UnmergeSrc + if (Register UnmergeSrc = tryMatchUnmergeDefs(ReadAnyLaneSrcs)) + return {UnmergeSrc}; + + // Multiple ReadAnyLane vgpr sources, need to merge Vgpr0, Vgpr1, ... 
+ return ReadAnyLaneSrcs; } // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc @@ -192,7 +233,7 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { return {}; int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); - Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); + auto *Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI); if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) return {}; @@ -202,7 +243,7 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); if (RALEl) - return RALElSrc; + return {RALElSrc}; return {}; } @@ -234,17 +275,27 @@ bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) RALDst = SrcMI.getOperand(1).getReg(); - Register RALSrc = getReadAnyLaneSrc(RALDst); - if (!RALSrc) + B.setInstrAndDebugLoc(Copy); + SmallVector<Register> ReadAnyLaneSrcRegs = getReadAnyLaneSrcs(RALDst); + if (ReadAnyLaneSrcRegs.empty()) return false; - B.setInstr(Copy); + Register ReadAnyLaneSrc; + if (ReadAnyLaneSrcRegs.size() == 1) { + ReadAnyLaneSrc = ReadAnyLaneSrcRegs[0]; + } else { + // Multiple readanylane sources without a common unmerge, merge them. 
+ auto Merge = B.buildMergeLikeInstr({VgprRB, MRI.getType(RALDst)}, + ReadAnyLaneSrcRegs); + ReadAnyLaneSrc = Merge.getReg(0); + } + if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { // Src = READANYLANE RALSrc Src = READANYLANE RALSrc // Dst = Copy Src $Dst = Copy Src // -> -> // Dst = RALSrc $Dst = Copy RALSrc - replaceRegWithOrBuildCopy(Dst, RALSrc); + replaceRegWithOrBuildCopy(Dst, ReadAnyLaneSrc); } else { // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc // Src = G_BITCAST RALDst Src = G_BITCAST RALDst @@ -252,7 +303,7 @@ bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( // -> -> // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst // Dst = NewVgpr $Dst = Copy NewVgpr - auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); + auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, ReadAnyLaneSrc); replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); } @@ -410,21 +461,15 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = MI->getOpcode(); // Insert point for use operands needs some calculation. if (Opc == AMDGPU::G_PHI) { - RBLHelper.applyMappingPHI(*MI); + if (!RBLHelper.applyMappingPHI(*MI)) + return false; continue; } // Opcodes that support pretty much all combinations of reg banks and LLTs // (except S1). There is no point in writing rules for them. - if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || - Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { - RBLHelper.applyMappingTrivial(*MI); - continue; - } - - // Opcodes that also support S1. - if (Opc == G_FREEZE && - MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { + if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_MERGE_VALUES || + Opc == AMDGPU::G_CONCAT_VECTORS || Opc == AMDGPU::G_BITCAST) { RBLHelper.applyMappingTrivial(*MI); continue; } @@ -441,7 +486,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { // S1 rules are in RegBankLegalizeRules. 
} - RBLHelper.findRuleAndApplyMapping(*MI); + if (!RBLHelper.findRuleAndApplyMapping(*MI)) + return false; } // Sgpr S1 clean up combines: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 5407566..d262f07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -32,28 +33,48 @@ using namespace AMDGPU; RegBankLegalizeHelper::RegBankLegalizeHelper( MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) - : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()), - MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()), + : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B), + MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr), + RBLRules(RBLRules), IsWave32(ST.isWave32()), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} -void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { - const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI); - const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI); +bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { + const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI); + if (!RuleSet) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "No AMDGPU RegBankLegalize rules defined for opcode", + MI); + return false; + } + + const RegBankLLTMapping *Mapping = 
RuleSet->findMappingForMI(MI, MRI, MUI); + if (!Mapping) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: none of the rules defined with " + "'Any' for MI's opcode matched MI", + MI); + return false; + } SmallSet<Register, 4> WaterfallSgprs; unsigned OpIdx = 0; - if (Mapping.DstOpMapping.size() > 0) { + if (Mapping->DstOpMapping.size() > 0) { B.setInsertPt(*MI.getParent(), std::next(MI.getIterator())); - applyMappingDst(MI, OpIdx, Mapping.DstOpMapping); + if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping)) + return false; } - if (Mapping.SrcOpMapping.size() > 0) { + if (Mapping->SrcOpMapping.size() > 0) { B.setInstr(MI); - applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs); + if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WaterfallSgprs)) + return false; } - lower(MI, Mapping, WaterfallSgprs); + if (!lower(MI, *Mapping, WaterfallSgprs)) + return false; + + return true; } bool RegBankLegalizeHelper::executeInWaterfallLoop( @@ -274,7 +295,7 @@ bool RegBankLegalizeHelper::executeInWaterfallLoop( return true; } -void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, +bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy) { MachineFunction &MF = B.getMF(); assert(MI.getNumMemOperands() == 1); @@ -322,9 +343,10 @@ void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, B.buildMergeLikeInstr(Dst, MergeTyParts); } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, +bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy) { MachineFunction &MF = B.getMF(); assert(MI.getNumMemOperands() == 1); @@ -350,9 +372,10 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, B.buildMergeLikeInstr(Dst, MergeTyParts); } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { +bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { 
Register Dst = MI.getDstReg(); Register Ptr = MI.getPointerReg(); MachineMemOperand &MMO = MI.getMMO(); @@ -376,9 +399,10 @@ void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const { } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); Register Src = MI.getOperand(1).getReg(); @@ -404,15 +428,22 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) { Hi = B.buildUndef({VgprRB_S32}); break; default: - llvm_unreachable("Opcode not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI); + return false; } B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)}); } else { - llvm_unreachable("Type not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI); + return false; } MI.eraseFromParent(); + return true; } std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) { @@ -437,7 +468,14 @@ std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) { return {Lo.getReg(0), Hi.getReg(0)}; } -void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { +std::pair<Register, Register> +RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) { + auto [Lo32, Hi32] = unpackAExt(Reg); + return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0), + B.buildTrunc(SgprRB_S16, Hi32).getReg(0)}; +} + +bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { case AMDGPU::G_SHL: { @@ -462,13 +500,18 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { break; } default: - llvm_unreachable("Unpack lowering not implemented"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU 
RegBankLegalize: lowerUnpackBitShift, case not implemented", + MI); + return false; } B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { case AMDGPU::G_SMIN: @@ -494,10 +537,25 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) { break; } default: - llvm_unreachable("Unpack min/max lowering not implemented"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI); + return false; } B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) { + auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg()); + auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo}); + auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi}); + B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), + {ResLo.getReg(0), ResHi.getReg(0)}); + MI.eraseFromParent(); + return true; } static bool isSignedBFE(MachineInstr &MI) { @@ -507,7 +565,7 @@ static bool isSignedBFE(MachineInstr &MI) { return MI.getOpcode() == AMDGPU::G_SBFX; } -void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == LLT::scalar(64)); bool Signed = isSignedBFE(MI); @@ -534,7 +592,7 @@ void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt); B.buildInstr(SHROpc, {Dst}, {SignBit, Amt}); MI.eraseFromParent(); - return; + return true; } uint64_t WidthImm = ConstWidth->Value.getZExtValue(); @@ -564,9 +622,10 @@ void 
RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) { } MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(DstReg); bool Signed = isSignedBFE(MI); @@ -591,15 +650,15 @@ void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) { // copies from reg class to reg bank. auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}}, {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)}); - if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(), - *ST.getRegisterInfo(), RBI)) - llvm_unreachable("failed to constrain BFE"); + constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(), + *ST.getRegisterInfo(), RBI); B.buildCopy(DstReg, S_BFE->getOperand(0).getReg()); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64); @@ -614,9 +673,113 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == S64); + auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg()); + auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg()); + + // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to + // match GlobalISel with old regbankselect. 
+ auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0)); + auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0)); + auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1)); + auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0)); + auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1); + auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry); + + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + unsigned Opc = MI.getOpcode(); + unsigned NumOps = MI.getNumOperands(); + auto Flags = MI.getFlags(); + + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); + + if (NumOps == 2) { + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; + } + + auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg()); + + if (NumOps == 3) { + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; + } + + assert(NumOps == 4); + auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg()); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) { + Register Dst0 = MI.getOperand(0).getReg(); + Register Dst1 = MI.getOperand(1).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1 = MI.getOperand(3).getReg(); + Register Src2 = MI.getOperand(4).getReg(); + + const GCNSubtarget &ST = 
B.getMF().getSubtarget<GCNSubtarget>(); + + // Keep the multiplication on the SALU. + Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0); + Register DstHi = MRI.createVirtualRegister(SgprRB_S32); + if (ST.hasScalarMulHiInsts()) { + B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1}); + } else { + auto VSrc0 = B.buildCopy(VgprRB_S32, Src0); + auto VSrc1 = B.buildCopy(VgprRB_S32, Src1); + auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1}); + buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI); + } + + // Accumulate and produce the "carry-out" bit. + + // The "carry-out" is defined as bit 64 of the result when computed as a + // big integer. For unsigned multiply-add, this matches the usual + // definition of carry-out. + if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) { + // No accumulate: result is just the multiplication, carry is 0. + B.buildMergeLikeInstr(Dst0, {DstLo, DstHi}); + B.buildConstant(Dst1, 0); + } else { + // Accumulate: add Src2 to the multiplication result with carry chain. 
+ Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32); + Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32); + B.buildUnmerge({Src2Lo, Src2Hi}, Src2); + + auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo); + auto AddHi = + B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1)); + B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)}); + B.buildCopy(Dst1, AddHi.getReg(1)); + } + + MI.eraseFromParent(); + return true; +} + +bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 || @@ -633,9 +796,10 @@ void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { +bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg()); int Amt = MI.getOperand(2).getImm(); Register Lo, Hi; @@ -660,9 +824,10 @@ void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) { B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); MI.eraseFromParent(); + return true; } -void RegBankLegalizeHelper::lower(MachineInstr &MI, +bool RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &WaterfallSgprs) { @@ -682,12 +847,14 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True, False); MI.eraseFromParent(); - return; + return true; } case UnpackBitShift: return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; @@ -707,20 +874,23 @@ void 
RegBankLegalizeHelper::lower(MachineInstr &MI, break; } default: - llvm_unreachable("Unsuported Opcode in Ext32To64"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode", + MI); + return false; } B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {MI.getOperand(1).getReg(), Hi}); MI.eraseFromParent(); - return; + return true; } case UniCstExt: { uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); B.buildConstant(MI.getOperand(0).getReg(), ConstVal); MI.eraseFromParent(); - return; + return true; } case VgprToVccCopy: { Register Src = MI.getOperand(1).getReg(); @@ -744,14 +914,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, auto Zero = B.buildConstant({VgprRB, Ty}, 0); B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero); MI.eraseFromParent(); - return; + return true; } case V_BFE: return lowerV_BFE(MI); case S_BFE: return lowerS_BFE(MI); + case UniMAD64: + return lowerUniMAD64(MI); + case UniMul64: { + B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2)); + MI.eraseFromParent(); + return true; + } + case DivSMulToMAD: { + auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1)); + auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2)); + auto Zero = B.buildConstant({VgprRB, S64}, 0); + + unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32 + ? 
AMDGPU::G_AMDGPU_MAD_U64_U32 + : AMDGPU::G_AMDGPU_MAD_I64_I32; + + B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}}, + {Op1, Op2, Zero}); + MI.eraseFromParent(); + return true; + } case SplitTo32: return lowerSplitTo32(MI); + case SplitTo32Mul: + return lowerSplitTo32Mul(MI); case SplitTo32Select: return lowerSplitTo32Select(MI); case SplitTo32SExtInReg: @@ -773,8 +966,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (Size / 128 == 4) splitLoad(MI, {B128, B128, B128, B128}); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("SplitLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: SplitLoad, unsuported type", + MI); + return false; } } // 64 and 32 bit load @@ -785,10 +980,12 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (DstTy == V6S16) splitLoad(MI, {V4S16, V2S16}, V2S16); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("SplitLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: SplitLoad, unsuported type", + MI); + return false; } - break; + return true; } case WidenLoad: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); @@ -799,19 +996,74 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, else if (DstTy == V6S16) widenLoad(MI, V8S16, V2S16); else { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("WidenLoad type not supported for MI"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: WidenLoad, unsuported type", + MI); + return false; } - break; + return true; } + case UnpackAExt: + return lowerUnpackAExt(MI); case WidenMMOToS32: return widenMMOToS32(cast<GAnyLoad>(MI)); + case VerifyAllSgpr: { + assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) { + return MRI.getRegBankOrNull(Op.getReg()) == SgprRB; + })); + return true; + } + case ApplyAllVgpr: { + 
assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) { + return MRI.getRegBankOrNull(Op.getReg()) == VgprRB; + })); + B.setInstrAndDebugLoc(MI); + for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) { + Register Reg = MI.getOperand(i).getReg(); + if (MRI.getRegBank(Reg) != VgprRB) { + auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg); + MI.getOperand(i).setReg(Copy.getReg(0)); + } + } + return true; + } + case UnmergeToShiftTrunc: { + GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI); + LLT Ty = MRI.getType(Unmerge->getSourceReg()); + if (Ty.getSizeInBits() % 32 != 0) { + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: unmerge not multiple of 32", + MI); + return false; + } + + B.setInstrAndDebugLoc(MI); + if (Ty.getSizeInBits() > 32) { + auto UnmergeV2S16 = + B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg()); + for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) { + auto [Dst0S32, Dst1S32] = + unpackAExt(UnmergeV2S16->getOperand(i).getReg()); + B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32); + B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32); + } + } else { + auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg()); + B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32); + B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32); + } + + MI.eraseFromParent(); + return true; + } } if (!WaterfallSgprs.empty()) { MachineBasicBlock::iterator I = MI.getIterator(); - executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs); + if (!executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs)) + return false; } + return true; } LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { @@ -832,20 +1084,26 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case Sgpr32ZExt: case UniInVgprS32: case Vgpr32: + case Vgpr32AExt: case Vgpr32SExt: case Vgpr32ZExt: return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return 
LLT::scalar(64); case Sgpr128: case Vgpr128: return LLT::scalar(128); + case SgprP0: case VgprP0: return LLT::pointer(0, 64); case SgprP1: case VgprP1: return LLT::pointer(1, 64); + case SgprP2: + case VgprP2: + return LLT::pointer(2, 32); case SgprP3: case VgprP3: return LLT::pointer(3, 32); @@ -855,18 +1113,26 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { case SgprP5: case VgprP5: return LLT::pointer(5, 32); + case SgprP8: + return LLT::pointer(8, 128); case SgprV2S16: case VgprV2S16: case UniInVgprV2S16: return LLT::fixed_vector(2, 16); case SgprV2S32: case VgprV2S32: + case UniInVgprV2S32: return LLT::fixed_vector(2, 32); + case VgprV3S32: + return LLT::fixed_vector(3, 32); case SgprV4S32: case SgprV4S32_WF: case VgprV4S32: case UniInVgprV4S32: return LLT::fixed_vector(4, 32); + case VgprV2S64: + case UniInVgprV2S64: + return LLT::fixed_vector(2, 64); default: return LLT(); } @@ -908,7 +1174,13 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { case VgprB128: case UniInVgprB128: if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) || - Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128)) + Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) || + isAnyPtr(Ty, 128)) + return Ty; + return LLT(); + case VgprB160: + case UniInVgprB160: + if (Ty.getSizeInBits() == 160) return Ty; return LLT(); case SgprB256: @@ -925,6 +1197,21 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) { Ty == LLT::fixed_vector(8, 64)) return Ty; return LLT(); + case SgprBRC: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + unsigned LLTSize = Ty.getSizeInBits(); + if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize)) + return Ty; + return LLT(); + } + case VgprBRC: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits())) + return 
Ty; + return LLT(); + } default: return LLT(); } @@ -940,10 +1227,13 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Sgpr32_WF: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: + case SgprP2: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -957,15 +1247,20 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case SgprB128: case SgprB256: case SgprB512: + case SgprBRC: case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: + case UniInVgprV2S32: case UniInVgprV4S32: + case UniInVgprV2S64: case UniInVgprB32: case UniInVgprB64: case UniInVgprB96: case UniInVgprB128: + case UniInVgprB160: case UniInVgprB256: case UniInVgprB512: case Sgpr32Trunc: @@ -980,6 +1275,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case Vgpr128: case VgprP0: case VgprP1: + case VgprP2: case VgprP3: case VgprP4: case VgprP5: @@ -988,13 +1284,18 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case VgprPtr128: case VgprV2S16: case VgprV2S32: + case VgprV2S64: + case VgprV3S32: case VgprV4S32: case VgprB32: case VgprB64: case VgprB96: case VgprB128: + case VgprB160: case VgprB256: case VgprB512: + case VgprBRC: + case Vgpr32AExt: case Vgpr32SExt: case Vgpr32ZExt: return VgprRB; @@ -1003,7 +1304,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { } } -void RegBankLegalizeHelper::applyMappingDst( +bool RegBankLegalizeHelper::applyMappingDst( MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) { // Defs start from operand 0 @@ -1022,10 +1323,12 @@ void RegBankLegalizeHelper::applyMappingDst( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: @@ -1035,11 +1338,14 @@ void RegBankLegalizeHelper::applyMappingDst( 
case Vgpr128: case VgprP0: case VgprP1: + case VgprP2: case VgprP3: case VgprP4: case VgprP5: case VgprV2S16: case VgprV2S32: + case VgprV2S64: + case VgprV3S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == getRegBankFromID(MethodIDs[OpIdx])); @@ -1052,6 +1358,7 @@ void RegBankLegalizeHelper::applyMappingDst( case SgprB128: case SgprB256: case SgprB512: + case SgprBRC: case SgprPtr32: case SgprPtr64: case SgprPtr128: @@ -1059,8 +1366,10 @@ void RegBankLegalizeHelper::applyMappingDst( case VgprB64: case VgprB96: case VgprB128: + case VgprB160: case VgprB256: case VgprB512: + case VgprBRC: case VgprPtr32: case VgprPtr64: case VgprPtr128: { @@ -1074,9 +1383,11 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(VccRB_S1); Op.setReg(NewDst); - auto CopyS32_Vcc = - B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst}); - B.buildTrunc(Reg, CopyS32_Vcc); + if (!MRI.use_empty(Reg)) { + auto CopyS32_Vcc = + B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst}); + B.buildTrunc(Reg, CopyS32_Vcc); + } break; } case UniInVgprS16: { @@ -1092,8 +1403,11 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: - case UniInVgprV4S32: { + case UniInVgprV2S32: + case UniInVgprV4S32: + case UniInVgprV2S64: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == SgprRB); Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty}); @@ -1105,6 +1419,7 @@ void RegBankLegalizeHelper::applyMappingDst( case UniInVgprB64: case UniInVgprB96: case UniInVgprB128: + case UniInVgprB160: case UniInVgprB256: case UniInVgprB512: { assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty)); @@ -1120,20 +1435,28 @@ void RegBankLegalizeHelper::applyMappingDst( assert(RB == SgprRB); Register NewDst = MRI.createVirtualRegister(SgprRB_S32); Op.setReg(NewDst); - B.buildTrunc(Reg, NewDst); + if (!MRI.use_empty(Reg)) + 
B.buildTrunc(Reg, NewDst); break; } case InvalidMapping: { - LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump();); - llvm_unreachable("missing fast rule for MI"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI); + return false; } default: - llvm_unreachable("ID not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI); + return false; } } + + return true; } -void RegBankLegalizeHelper::applyMappingSrc( +bool RegBankLegalizeHelper::applyMappingSrc( MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs, SmallSet<Register, 4> &SgprWaterfallOperandRegs) { @@ -1163,10 +1486,12 @@ void RegBankLegalizeHelper::applyMappingSrc( case Sgpr32: case Sgpr64: case Sgpr128: + case SgprP0: case SgprP1: case SgprP3: case SgprP4: case SgprP5: + case SgprP8: case SgprV2S16: case SgprV2S32: case SgprV4S32: { @@ -1181,6 +1506,7 @@ void RegBankLegalizeHelper::applyMappingSrc( case SgprB128: case SgprB256: case SgprB512: + case SgprBRC: case SgprPtr32: case SgprPtr64: case SgprPtr128: { @@ -1195,11 +1521,14 @@ void RegBankLegalizeHelper::applyMappingSrc( case Vgpr128: case VgprP0: case VgprP1: + case VgprP2: case VgprP3: case VgprP4: case VgprP5: case VgprV2S16: case VgprV2S32: + case VgprV2S64: + case VgprV3S32: case VgprV4S32: { assert(Ty == getTyFromID(MethodIDs[i])); if (RB != VgprRB) { @@ -1213,8 +1542,10 @@ void RegBankLegalizeHelper::applyMappingSrc( case VgprB64: case VgprB96: case VgprB128: + case VgprB160: case VgprB256: case VgprB512: + case VgprBRC: case VgprPtr32: case VgprPtr64: case VgprPtr128: { @@ -1268,6 +1599,13 @@ void RegBankLegalizeHelper::applyMappingSrc( Op.setReg(Zext.getReg(0)); break; } + case Vgpr32AExt: { + assert(Ty.getSizeInBits() < 32); + assert(RB == VgprRB); + auto Aext = B.buildAnyExt({VgprRB, S32}, Reg); + 
Op.setReg(Aext.getReg(0)); + break; + } case Vgpr32SExt: { // Note this ext allows S1, and it is meant to be combined away. assert(Ty.getSizeInBits() < 32); @@ -1285,12 +1623,16 @@ void RegBankLegalizeHelper::applyMappingSrc( break; } default: - llvm_unreachable("ID not supported"); + reportGISelFailure( + MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI); + return false; } } + return true; } -void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { +bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); @@ -1313,16 +1655,17 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { MI.getOperand(i).setReg(NewUse.getReg(0)); } - return; + return true; } - // ALL divergent i1 phis should be already lowered and inst-selected into PHI - // with sgpr reg class and S1 LLT. + // ALL divergent i1 phis should have been lowered and inst-selected into PHI + // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass. // Note: this includes divergent phis that don't require lowering. if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) { - LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump();); - llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering " - "before RegBankLegalize to lower lane mask(vcc) phis"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI", + MI); + return false; } // We accept all types that can fit in some register class. @@ -1330,11 +1673,13 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. 
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64)) { - return; + return true; } - LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump();); - llvm_unreachable("type not supported"); + reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize", + "AMDGPU RegBankLegalize: type not supported for G_PHI", + MI); + return false; } [[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815..86669ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -12,6 +12,7 @@ #include "AMDGPURegBankLegalizeRules.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -27,11 +28,13 @@ namespace AMDGPU { // to replace instruction. In other case InstApplyMethod will create new // instruction(s). class RegBankLegalizeHelper { + MachineFunction &MF; const GCNSubtarget &ST; MachineIRBuilder &B; MachineRegisterInfo &MRI; const MachineUniformityInfo &MUI; const RegisterBankInfo &RBI; + MachineOptimizationRemarkEmitter MORE; const RegBankLegalizeRules &RBLRules; const bool IsWave32; const RegisterBank *SgprRB; @@ -72,6 +75,7 @@ class RegBankLegalizeHelper { static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -80,10 +84,10 @@ public: const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules); - void findRuleAndApplyMapping(MachineInstr &MI); + bool findRuleAndApplyMapping(MachineInstr &MI); // Manual apply helpers. 
- void applyMappingPHI(MachineInstr &MI); + bool applyMappingPHI(MachineInstr &MI); void applyMappingTrivial(MachineInstr &MI); private: @@ -96,34 +100,39 @@ private: const RegisterBank *getRegBankFromID(RegBankLLTMappingApplyID ID); - void + bool applyMappingDst(MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs); - void + bool applyMappingSrc(MachineInstr &MI, unsigned &OpIdx, const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs, SmallSet<Register, 4> &SgprWaterfallOperandRegs); - void splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, + bool splitLoad(MachineInstr &MI, ArrayRef<LLT> LLTBreakdown, LLT MergeTy = LLT()); - void widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); - void widenMMOToS32(GAnyLoad &MI) const; + bool widenLoad(MachineInstr &MI, LLT WideTy, LLT MergeTy = LLT()); + bool widenMMOToS32(GAnyLoad &MI) const; - void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, + bool lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet<Register, 4> &SgprWaterfallOperandRegs); - void lowerVccExtToSel(MachineInstr &MI); + bool lowerVccExtToSel(MachineInstr &MI); std::pair<Register, Register> unpackZExt(Register Reg); std::pair<Register, Register> unpackSExt(Register Reg); std::pair<Register, Register> unpackAExt(Register Reg); - void lowerUnpackBitShift(MachineInstr &MI); - void lowerV_BFE(MachineInstr &MI); - void lowerS_BFE(MachineInstr &MI); - void lowerSplitTo32(MachineInstr &MI); - void lowerSplitTo32Select(MachineInstr &MI); - void lowerSplitTo32SExtInReg(MachineInstr &MI); - void lowerUnpackMinMax(MachineInstr &MI); + std::pair<Register, Register> unpackAExtTruncS16(Register Reg); + bool lowerUnpackBitShift(MachineInstr &MI); + bool lowerV_BFE(MachineInstr &MI); + bool lowerS_BFE(MachineInstr &MI); + bool lowerUniMAD64(MachineInstr &MI); + bool lowerSplitTo32(MachineInstr &MI); + bool lowerSplitTo32Mul(MachineInstr &MI); + bool lowerSplitTo16(MachineInstr &MI); + bool 
lowerSplitTo32Select(MachineInstr &MI); + bool lowerSplitTo32SExtInReg(MachineInstr &MI); + bool lowerUnpackMinMax(MachineInstr &MI); + bool lowerUnpackAExt(MachineInstr &MI); }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a..a0be07d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -60,20 +60,28 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(0, 64); case P1: return MRI.getType(Reg) == LLT::pointer(1, 64); + case P2: + return MRI.getType(Reg) == LLT::pointer(2, 32); case P3: return MRI.getType(Reg) == LLT::pointer(3, 32); case P4: return MRI.getType(Reg) == LLT::pointer(4, 64); case P5: return MRI.getType(Reg) == LLT::pointer(5, 32); + case P8: + return MRI.getType(Reg) == LLT::pointer(8, 128); case Ptr32: return isAnyPtr(MRI.getType(Reg), 32); case Ptr64: return isAnyPtr(MRI.getType(Reg), 64); case Ptr128: return isAnyPtr(MRI.getType(Reg), 128); + case V2S16: + return MRI.getType(Reg) == LLT::fixed_vector(2, 16); case V2S32: return MRI.getType(Reg) == LLT::fixed_vector(2, 32); + case V3S32: + return MRI.getType(Reg) == LLT::fixed_vector(3, 32); case V4S32: return MRI.getType(Reg) == LLT::fixed_vector(4, 32); case B32: @@ -84,6 +92,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 96; case B128: return MRI.getType(Reg).getSizeInBits() == 128; + case B160: + return MRI.getType(Reg).getSizeInBits() == 160; case B256: return MRI.getType(Reg).getSizeInBits() == 256; case B512: @@ -102,12 +112,16 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg); case UniP1: return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg); + case UniP2: 
+ return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg); case UniP3: return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg); case UniP4: return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg); case UniP5: return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg); + case UniP8: + return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg); case UniPtr32: return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg); case UniPtr64: @@ -116,6 +130,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg); case UniV2S16: return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg); + case UniV2S32: + return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg); case UniB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); case UniB64: @@ -124,10 +140,23 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg); case UniB128: return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg); + case UniB160: + return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg); case UniB256: return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg); case UniB512: return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg); + case UniBRC: { + if (!MUI.isUniform(Reg)) + return false; + // Check if there is SGPR register class of same size as the LLT. + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + // There is no 16 bit SGPR register class. Extra size check is required + // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16. 
+ unsigned LLTSize = MRI.getType(Reg).getSizeInBits(); + return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize); + } case DivS1: return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); case DivS16: @@ -142,6 +171,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg); case DivP1: return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + case DivP2: + return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg); case DivP3: return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg); case DivP4: @@ -156,6 +187,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg); case DivV2S16: return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg); + case DivV2S32: + return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg); case DivB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); case DivB64: @@ -164,10 +197,20 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg); case DivB128: return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg); + case DivB160: + return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg); case DivB256: return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg); case DivB512: return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg); + case DivBRC: { + if (!MUI.isDivergent(Reg)) + return false; + // Check if there is VGPR register class of same size as the LLT. 
+ const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits()); + } case _: return true; default: @@ -202,7 +245,7 @@ bool PredicateMapping::match(const MachineInstr &MI, return true; } -SetOfRulesForOpcode::SetOfRulesForOpcode() {} +SetOfRulesForOpcode::SetOfRulesForOpcode() = default; SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) : FastTypes(FastTypes) {} @@ -234,12 +277,13 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) { return B64; if (Ty == LLT::fixed_vector(3, 32)) return B96; - if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128)) + if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) || + Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128)) return B128; return _; } -const RegBankLLTMapping & +const RegBankLLTMapping * SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const { @@ -256,17 +300,16 @@ SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); if (Slot != -1) - return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot]; + return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot]; } // Slow search for more complex rules. 
for (const RegBankLegalizeRule &Rule : Rules) { if (Rule.Predicate.match(MI, MUI, MRI)) - return Rule.OperandMapping; + return &Rule.OperandMapping; } - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("None of the rules defined for MI's opcode matched MI"); + return nullptr; } void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) { @@ -277,14 +320,14 @@ void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs) { int Slot = getFastPredicateSlot(Ty); assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); - Div[Slot] = RuleApplyIDs; + Div[Slot] = std::move(RuleApplyIDs); } void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs) { int Slot = getFastPredicateSlot(Ty); assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); - Uni[Slot] = RuleApplyIDs; + Uni[Slot] = std::move(RuleApplyIDs); } int SetOfRulesForOpcode::getFastPredicateSlot( @@ -349,7 +392,7 @@ RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList, return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes); } -const SetOfRulesForOpcode & +const SetOfRulesForOpcode * RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT || @@ -357,19 +400,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) { unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); auto IRAIt = IRulesAlias.find(IntrID); - if (IRAIt == IRulesAlias.end()) { - LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); - llvm_unreachable("No rules defined for intrinsic opcode"); - } - return IRules.at(IRAIt->second); + if (IRAIt == IRulesAlias.end()) + return nullptr; + return &IRules.at(IRAIt->second); } auto GRAIt = GRulesAlias.find(Opc); - if (GRAIt == GRulesAlias.end()) { - LLVM_DEBUG(dbgs() << "MI: "; 
MI.dump();); - llvm_unreachable("No rules defined for generic opcode"); - } - return GRules.at(GRAIt->second); + if (GRAIt == GRulesAlias.end()) + return nullptr; + return &GRules.at(GRAIt->second); } // Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'. @@ -470,9 +509,54 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackAExt}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}); + + addRulesForGOpcs({G_UADDO, G_USUBO}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard) + .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}}) + .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}}); + + addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); + + bool HasVecMulU64 = ST->hasVectorMulU64(); + addRulesForGOpcs({G_MUL}, Standard) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}) + .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64) + 
.Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64); + + bool hasMulHi = ST->hasScalarMulHiInsts(); + addRulesForGOpcs({G_UMULH, G_SMULH}, Standard) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi); + + addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard) + .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}}) + .Uni(S64, {{Sgpr64, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr64}, UniMAD64}); - addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + bool HasScalarSMulU64 = ST->hasScalarSMulU64(); + addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD}); addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB) .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}}) @@ -514,6 +598,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + addRulesForGOpcs({G_FSHR}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}}); addRulesForGOpcs({G_UBFX, G_SBFX}, Standard) @@ -538,21 +626,56 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax}) .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); - // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT - // and G_FREEZE here, rest is trivially regbankselected earlier + // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT and G_FCONSTANT + // here, rest is trivially regbankselected earlier addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}}); addRulesForGOpcs({G_CONSTANT}) .Any({{UniS1, _}, 
{{Sgpr32Trunc}, {None}, UniCstExt}}); - addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}}); - addRulesForGOpcs({G_ICMP}) - .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) - .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); + addRulesForGOpcs({G_FREEZE}) + .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}}) + .Any({{DivS1}, {{Vcc}, {Vcc}}}) + .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}}) + .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}}) + .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}}); + + addRulesForGOpcs({G_UNMERGE_VALUES}) + .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}}) + .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}}) + .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}}); + + Predicate isSignedICmp([](const MachineInstr &MI) -> bool { + auto Pred = + static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + return CmpInst::isSigned(Pred); + }); + + Predicate isEqualityICmp([](const MachineInstr &MI) -> bool { + auto Pred = + static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + return ICmpInst::isEquality(Pred); + }); - addRulesForGOpcs({G_FCMP}) - .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}) - .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}); + bool HasScalarCompareEq64 = ST->hasScalarCompareEq64(); + // clang-format off + addRulesForGOpcs({G_ICMP}) + .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}}) + .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}}) + .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}}) + .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}}) + .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) + .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64) + 
.Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64) + .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}) + .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}}) + .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}}) + .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}}) + .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64) + .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64) + .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}) + .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}}); + // clang-format on addRulesForGOpcs({G_BRCOND}) .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}}) @@ -580,6 +703,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}) .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}); + bool Has16bitCmp = ST->has16BitInsts(); + // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY. // It is up to user to deal with truncated bits. addRulesForGOpcs({G_TRUNC}) @@ -593,7 +718,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}}) .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) // This is non-trivial. VgprToVccCopy is done using compare instruction. 
- .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}) + .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp) + .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr32AExt}, VgprToVccCopy}}, + !Has16bitCmp) .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}}) .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}}); @@ -639,6 +766,64 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}}) .Div(S64, {{Vgpr64}, {Vgpr64, Imm}}); + // Atomic read-modify-write operations: result and value are always VGPR, + // pointer varies by address space. + addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG, + G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, + G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, + G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP, + G_ATOMICRMW_UDEC_WRAP}) + .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}}) + .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}}) + .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}}) + .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}}) + .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}}) + .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}}); + + bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts(); + bool HasAtomicBufferGlobalPkAddF16Insts = + ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || + ST->hasAtomicBufferGlobalPkAddF16Insts(); + bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts(); + addRulesForGOpcs({G_ATOMICRMW_FADD}) + .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}}) + .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}}) + .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}}) + .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}}) + .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}}) + .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}}) + .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}}, + HasAtomicFlatPkAdd16Insts) + .Any({{DivV2S16, P1, V2S16}, 
{{VgprV2S16}, {VgprP1, VgprV2S16}}}, + HasAtomicBufferGlobalPkAddF16Insts) + .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}}, + HasAtomicDsPkAdd16Insts); + + addRulesForGOpcs({G_ATOMIC_CMPXCHG}) + .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}}) + .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}}) + .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}}) + .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}}); + + addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG}) + .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}}) + .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}}) + .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}}) + .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard) + .Div(S32, {{Vgpr32}, + {Vgpr32, Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(S64, {{Vgpr64}, + {Vgpr64, Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX, + G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_SMAX, + G_AMDGPU_BUFFER_ATOMIC_SMIN}, + Standard) + .Div(S32, {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(S64, {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + bool hasSMRDx3 = ST->hasScalarDwordx3Loads(); bool hasSMRDSmall = ST->hasScalarSubwordLoads(); bool usesTrue16 = ST->useRealTrue16Insts(); @@ -860,6 +1045,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}}) .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}}) .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}}); + // clang-format on addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT, @@ -874,8 +1060,49 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, 
Sgpr32_WF}}); - addRulesForGOpcs({G_AMDGPU_BUFFER_STORE}) - .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE, + G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE}, + StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs( + {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE}, + StandardB) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE}, + StandardB) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Any({{DivB160}, {{VgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{UniB160}, + {{UniInVgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}); + + addRulesForGOpcs( + {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16}, + StandardB) + .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) + .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE, + G_AMDGPU_BUFFER_STORE_SHORT, 
G_AMDGPU_BUFFER_STORE_FORMAT, + G_AMDGPU_BUFFER_STORE_FORMAT_D16, + G_AMDGPU_TBUFFER_STORE_FORMAT, + G_AMDGPU_TBUFFER_STORE_FORMAT_D16}) + .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}) + .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}); addRulesForGOpcs({G_PTR_ADD}) .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}}) @@ -899,34 +1126,237 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}}) .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}}); + // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel. + // Currently crashes on P8 (buffer resource) tests due to legalizer issue. + addRulesForGOpcs({G_PTRMASK}) + .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) + .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) + .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}}) + .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}}); + addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); - addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_BITREVERSE}, Standard) + .Uni(S32, {{Sgpr32}, {Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{Sgpr64}, {Sgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}); + + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) + .Uni(S64, {{Sgpr64}, {}}); + + addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}}); + + addRulesForGOpcs({G_GLOBAL_VALUE}) + .Any({{UniP0}, {{SgprP0}, {}}}) + .Any({{UniP1}, {{SgprP1}, {}}}) + .Any({{UniP3}, {{SgprP3}, {}}}) + .Any({{UniP4}, {{SgprP4}, {}}}) + .Any({{UniP8}, {{SgprP8}, {}}}); + + addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}}); bool hasSALUFloat = ST->hasSALUFloatInsts(); - 
addRulesForGOpcs({G_FADD}, Standard) + addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}); - addRulesForGOpcs({G_FPTOUI}) - .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) - .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat); - addRulesForGOpcs({G_UITOFP}) + addRulesForGOpcs({G_FMAD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard) + .Div(S16, 
{{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}}) + .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat) + .Uni(V2S16, + {{SgprV2S16}, {SgprV2S16, SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}}, + !hasSALUFloat); + + addRulesForGOpcs({G_AMDGPU_FMED3}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + + // TODO: This opcode is generated from the i64->i16 signed clamped pattern in + // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more + // instructions on SALU. + addRulesForGOpcs({G_AMDGPU_SMED3}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); + + // FNEG and FABS are either folded as source modifiers or can be selected as + // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for + // targets without SALU float we still select them as VGPR since there would + // be no real sgpr use. 
+ addRulesForGOpcs({G_FNEG, G_FABS}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); + + addRulesForGOpcs({G_FCANONICALIZE}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); + + bool hasPST = ST->hasPseudoScalarTrans(); + addRulesForGOpcs({G_FSQRT}, Standard) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST); + + addRulesForGOpcs({G_FPTOUI, G_FPTOSI}) + .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}}) + .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}}) + .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat) + .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat) + .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat) .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}}) + .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}); + + addRulesForGOpcs({G_UITOFP, G_SITOFP}) + .Any({{UniS16, 
S16}, {{UniInVgprS16}, {Vgpr16}}}) + .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}}) + .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat) + .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) - .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat) + .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}}) + .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}}) + .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}}); + + addRulesForGOpcs({G_FPEXT}) + .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}}) + .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}}) + .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}}) + .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat) + .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat); + + addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard) + .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}}) + .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_FPTRUNC}) + .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}}) + .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}}) + .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}}) + .Any({{UniV2S16, V2S32}, {{UniInVgprV2S16}, {VgprV2S32}}}) + .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}}) + .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat); + + addRulesForGOpcs({G_IS_FPCLASS}) + .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}}) + .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}}) + .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}}) + .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}}) + .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}}) + .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}}); + + addRulesForGOpcs({G_FCMP}, Standard) + 
.Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}}, + hasSALUFloat) + .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}}, + !hasSALUFloat) + .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}}) + .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}, + hasSALUFloat) + .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}, + !hasSALUFloat) + .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}) + .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}}); + + addRulesForGOpcs({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUNDEVEN, G_FFLOOR, G_FCEIL, + G_FEXP2, G_FLOG2}, + Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}); using namespace Intrinsic; addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}}); + addRulesForIOpcs({amdgcn_groupstaticsize}).Any({{S32}, {{Sgpr32}, {IntrId}}}); + // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir. - addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}}); + addRulesForIOpcs({amdgcn_end_cf}) + .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}}) + .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}}); addRulesForIOpcs({amdgcn_if_break}, Standard) + .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}}) .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}}); addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard) @@ -938,4 +1368,68 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, // readfirstlaning just in case register is not in sgpr. 
.Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}}); + addRulesForIOpcs({amdgcn_s_sleep}).Any({{_, _}, {{}, {IntrId, Imm}}}); + + addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}}) + .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_mulhi_u24, amdgcn_mulhi_i24, amdgcn_fmul_legacy}, + Standard) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_fma_legacy}, Standard) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}); + + addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard) + .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}) + .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}}) + .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}}); + + addRulesForIOpcs({amdgcn_prng_b32}) + .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}}) + .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}}); + + addRulesForIOpcs({amdgcn_sffbh}, Standard) + .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}}); + + addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard) + .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}}) + .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE}) + .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE}) + .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE}); + + addRulesForIOpcs({amdgcn_global_load_tr_b64}) + .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}}) + .Any({{DivB32}, {{VgprB32}, {IntrId, SgprP1}}}); + + addRulesForIOpcs({amdgcn_global_load_tr_b128}) + .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}}) + .Any({{DivB128}, {{VgprB128}, 
{IntrId, SgprP1}}}); + + addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64}) + .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}}); + + addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm}, StandardB) + .Div(B32, {{VgprB32}, {IntrId, VgprB32}}) + .Uni(B32, {{SgprB32}, {IntrId, SgprB32}}) + .Div(B64, {{VgprB64}, {IntrId, VgprB64}}) + .Uni(B64, {{SgprB64}, {IntrId, SgprB64}}) + .Div(B96, {{VgprB96}, {IntrId, VgprB96}}) + .Uni(B96, {{SgprB96}, {IntrId, SgprB96}}) + .Div(B128, {{VgprB128}, {IntrId, VgprB128}}) + .Uni(B128, {{SgprB128}, {IntrId, SgprB128}}) + .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}}) + .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}}) + .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}}) + .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}}); + } // end initialize rules diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efd..eee4f62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -60,24 +60,29 @@ enum UniformityLLTOpPredicateID { // pointers P0, P1, + P2, P3, P4, P5, + P8, Ptr32, Ptr64, Ptr128, UniP0, UniP1, + UniP2, UniP3, UniP4, UniP5, + UniP8, UniPtr32, UniPtr64, UniPtr128, DivP0, DivP1, + DivP2, DivP3, DivP4, DivP5, @@ -88,18 +93,24 @@ enum UniformityLLTOpPredicateID { // vectors V2S16, V2S32, + V2S64, V3S32, V4S32, UniV2S16, + UniV2S32, + UniV2S64, DivV2S16, + DivV2S32, + DivV2S64, // B types B32, B64, B96, B128, + B160, B256, B512, @@ -107,15 +118,19 @@ enum UniformityLLTOpPredicateID { UniB64, UniB96, UniB128, + UniB160, UniB256, UniB512, + UniBRC, DivB32, DivB64, DivB96, DivB128, + DivB160, DivB256, DivB512, + DivBRC }; // How to apply register bank on register operand. 
@@ -134,10 +149,13 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, + SgprP2, SgprP3, SgprP4, SgprP5, + SgprP8, SgprPtr32, SgprPtr64, SgprPtr128, @@ -150,6 +168,7 @@ enum RegBankLLTMappingApplyID { SgprB128, SgprB256, SgprB512, + SgprBRC, // vgpr scalars, pointers, vectors and B-types Vgpr16, @@ -158,6 +177,7 @@ enum RegBankLLTMappingApplyID { Vgpr128, VgprP0, VgprP1, + VgprP2, VgprP3, VgprP4, VgprP5, @@ -166,24 +186,32 @@ enum RegBankLLTMappingApplyID { VgprPtr128, VgprV2S16, VgprV2S32, + VgprV3S32, VgprB32, VgprB64, VgprB96, VgprB128, + VgprB160, VgprB256, VgprB512, + VgprBRC, VgprV4S32, + VgprV2S64, // Dst only modifiers: read-any-lane and truncs UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, + UniInVgprV2S64, UniInVgprB32, UniInVgprB64, UniInVgprB96, UniInVgprB128, + UniInVgprB160, UniInVgprB256, UniInVgprB512, @@ -198,6 +226,7 @@ enum RegBankLLTMappingApplyID { Sgpr32AExtBoolInReg, Sgpr32SExt, Sgpr32ZExt, + Vgpr32AExt, Vgpr32SExt, Vgpr32ZExt, }; @@ -216,14 +245,23 @@ enum LoweringMethodID { S_BFE, V_BFE, VgprToVccCopy, + UniMAD64, + UniMul64, + DivSMulToMAD, SplitTo32, + SplitTo32Mul, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, UniCstExt, SplitLoad, WidenLoad, - WidenMMOToS32 + WidenMMOToS32, + UnpackAExt, + VerifyAllSgpr, + ApplyAllVgpr, + UnmergeToShiftTrunc }; enum FastRulesTypes { @@ -277,7 +315,7 @@ public: SetOfRulesForOpcode(); SetOfRulesForOpcode(FastRulesTypes FastTypes); - const RegBankLLTMapping & + const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const; @@ -297,7 +335,7 @@ private: class RegBankLegalizeRules { const GCNSubtarget *ST; MachineRegisterInfo *MRI; - // Separate maps for G-opcodes and instrinsics since they are in different + // Separate maps for G-opcodes and intrinsics since they are in different // enums. 
Multiple opcodes can share same set of rules. // RulesAlias = map<Opcode, KeyOpcode> // Rules = map<KeyOpcode, SetOfRulesForOpcode> @@ -375,7 +413,7 @@ public: MRI = &_MRI; }; - const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const; + const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const; }; } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 56807a4..e8f316d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -287,9 +287,6 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank & AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - if (&RC == &AMDGPU::SReg_1RegClass) - return AMDGPU::VCCRegBank; - // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a // VCC-like use. if (TRI->isSGPRClass(&RC)) { @@ -471,7 +468,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1263,11 +1260,14 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets( } } + const bool CheckNUW = Subtarget.hasGFX1250Insts(); Register Base; unsigned Offset; std::tie(Base, Offset) = - AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); + AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset, + /*KnownBits=*/nullptr, + /*CheckNUW=*/CheckNUW); uint32_t SOffset, ImmOffset; if ((int)Offset > 0 && @@ -1292,7 +1292,8 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets( // Handle the variable sgpr + vgpr case. 
MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); - if (Add && (int)Offset >= 0) { + if (Add && (int)Offset >= 0 && + (!CheckNUW || Add->getFlag(MachineInstr::NoUWrap))) { Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI); Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI); @@ -1561,8 +1562,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); - if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) - llvm_unreachable("failed to constrain BFE"); + constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this); MI.eraseFromParent(); return true; @@ -1873,11 +1873,11 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); B.buildInstr(AMDGPU::V_MOV_B32_e32) - .addDef(TmpReg0) - .addUse(SrcReg, 0, AMDGPU::sub0); + .addDef(TmpReg0) + .addUse(SrcReg, {}, AMDGPU::sub0); B.buildInstr(AMDGPU::V_MOV_B32_e32) - .addDef(TmpReg1) - .addUse(SrcReg, 0, AMDGPU::sub1); + .addDef(TmpReg1) + .addUse(SrcReg, {}, AMDGPU::sub1); B.buildInstr(AMDGPU::REG_SEQUENCE) .addDef(DstReg) .addUse(TmpReg0) @@ -2412,7 +2412,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstBank == &AMDGPU::VCCRegBank) break; - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); LegalizerHelper Helper(*MF, ApplyBank, B); @@ -2492,7 +2492,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // There is no VALU abs instruction so we need to replace it with a sub and // max combination. 
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); LegalizerHelper Helper(*MF, Apply, B); @@ -3114,6 +3114,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { @@ -3283,6 +3285,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } + case Intrinsic::amdgcn_s_alloc_vgpr: + constrainOpWithReadfirstlane(B, MI, 2); + return; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? @@ -3297,7 +3302,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 1); // M0 return; case Intrinsic::amdgcn_raw_buffer_load_lds: - case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 1); // rsrc constrainOpWithReadfirstlane(B, MI, 2); // M0 @@ -3305,7 +3312,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 1); // rsrc constrainOpWithReadfirstlane(B, MI, 2); // M0 @@ -3321,7 
+3330,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_load_to_lds: - case Intrinsic::amdgcn_global_load_lds: { + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 2); return; @@ -3348,6 +3359,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: constrainOpWithReadfirstlane(B, MI, 1); return; case Intrinsic::amdgcn_s_barrier_init: @@ -3496,6 +3508,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); break; } + case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR: + case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: case AMDGPU::G_LOAD: case AMDGPU::G_ZEXTLOAD: case AMDGPU::G_SEXTLOAD: { @@ -3607,7 +3621,7 @@ unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, } bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) @@ -3623,7 +3637,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3641,7 +3655,7 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingVOP(const 
MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3665,7 +3679,7 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); @@ -3744,7 +3758,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 2> OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); @@ -3834,7 +3848,7 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, // const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { @@ -4084,6 +4098,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FPTOSI: case AMDGPU::G_FPTOUI: + case AMDGPU::G_FPTOSI_SAT: + case AMDGPU::G_FPTOUI_SAT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: { unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -4502,6 +4518,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const 
MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { @@ -4577,6 +4595,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; + case AMDGPU::G_AMDGPU_SPONENTRY: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } case AMDGPU::G_INTRINSIC: case AMDGPU::G_INTRINSIC_CONVERGENT: { switch (cast<GIntrinsic>(MI).getIntrinsicID()) { @@ -4835,6 +4858,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_perm_pk16_b4_u4: case Intrinsic::amdgcn_perm_pk16_b6_u4: case Intrinsic::amdgcn_perm_pk16_b8_u4: + case Intrinsic::amdgcn_add_max_i32: + case Intrinsic::amdgcn_add_max_u32: + case Intrinsic::amdgcn_add_min_i32: + case Intrinsic::amdgcn_add_min_u32: + case Intrinsic::amdgcn_pk_add_max_i16: + case Intrinsic::amdgcn_pk_add_max_u16: + case Intrinsic::amdgcn_pk_add_min_i16: + case Intrinsic::amdgcn_pk_add_min_u16: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: @@ -5073,17 +5104,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? 
getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); @@ -5209,11 +5240,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: case Intrinsic::amdgcn_wave_reduce_min: case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_fmin: case Intrinsic::amdgcn_wave_reduce_max: case Intrinsic::amdgcn_wave_reduce_umax: + case Intrinsic::amdgcn_wave_reduce_fmax: case Intrinsic::amdgcn_wave_reduce_and: case Intrinsic::amdgcn_wave_reduce_or: case Intrinsic::amdgcn_wave_reduce_xor: { @@ -5225,11 +5260,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); break; } - case Intrinsic::amdgcn_s_bitreplicate: + case Intrinsic::amdgcn_s_bitreplicate: { Register MaskReg = MI.getOperand(2).getReg(); unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32); + break; + } 
+ case Intrinsic::amdgcn_wave_shuffle: { + unsigned OpSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + break; + } } break; } @@ -5296,12 +5340,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: @@ -5311,12 +5353,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_load_tr16_b128: case Intrinsic::amdgcn_ds_load_tr4_b64: case Intrinsic::amdgcn_ds_load_tr6_b96: - case Intrinsic::amdgcn_flat_load_monitor_b32: - case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_flat_load_monitor_b128: - case Intrinsic::amdgcn_global_load_monitor_b32: - case Intrinsic::amdgcn_global_load_monitor_b64: - case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -5359,6 +5395,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_alloc_vgpr: + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1); + OpdsMapping[2] = 
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // This must be an SGPR, but accept a VGPR. @@ -5418,7 +5458,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_raw_buffer_load_lds: - case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); @@ -5451,7 +5493,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); @@ -5570,6 +5614,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; case Intrinsic::amdgcn_s_barrier_init: @@ -5696,6 +5741,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_FMAX: case AMDGPU::G_ATOMICRMW_UINC_WRAP: case AMDGPU::G_ATOMICRMW_UDEC_WRAP: + case AMDGPU::G_ATOMICRMW_USUB_COND: + case AMDGPU::G_ATOMICRMW_USUB_SAT: 
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); @@ -5728,6 +5775,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; + case AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR: + case AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); + break; + } } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 1c1a6da..c37d309 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,7 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", >; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; +def VCCRegBank : RegisterBank<"VCC", [SReg_32, SReg_64]>; def AGPRRegBank : RegisterBank <"AGPR", [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add..4e664e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -256,17 +256,13 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( // Pseudo used just to encode the underlying global. Is there a better // way to track this? 
+ // TODO: Some of the generic call-like pseudos do not encode the callee, + // so we overly conservatively treat this as an indirect call. const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = getCalleeFunction(*CalleeOp); - - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); + const Function *Callee = + CalleeOp ? getCalleeFunction(*CalleeOp) : nullptr; auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 89c16da..7a5db42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" @@ -96,8 +97,8 @@ public: /// Compute the register class constraints based on the uses of \p Reg, /// excluding MFMA uses from which can be rewritten to change the register - /// class constraint. This should be nearly identical to - /// MachineRegisterInfo::recomputeRegClass. + /// class constraint. MFMA scale operands need to be constraint checked. + /// This should be nearly identical to MachineRegisterInfo::recomputeRegClass. /// \p RewriteCandidates will collect the set of MFMA instructions that need /// to have the opcode mutated to perform the replacement. 
@@ -151,9 +152,16 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the // effects of rewrite candidates. It just so happens that we can use - // either AGPR or VGPR in src0/src1, so don't bother checking the - // constraint effects of the individual operands. + // either AGPR or VGPR in src0/src1. We still need to check constraint + // effects for scale variant, which does not allow AGPR. if (isRewriteCandidate(*MI)) { + int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()); + const MCInstrDesc &AGPRDesc = TII.get(AGPROp); + const TargetRegisterClass *NewRC = + TII.getRegClass(AGPRDesc, MO.getOperandNo()); + if (!TRI.hasAGPRs(NewRC)) + return false; + const MachineOperand *VDst = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); const MachineOperand *Src2 = @@ -587,10 +595,7 @@ public: static char ID; RegisterClassInfo RegClassInfo; - AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { - initializeAMDGPURewriteAGPRCopyMFMALegacyPass( - *PassRegistry::getPassRegistry()); - } + AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -659,7 +664,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); - PA.preserveSet<CFGAnalyses>(); - PA.preserve<LiveStacksAnalysis>(); + PA.preserveSet<CFGAnalyses>() + .preserve<LiveStacksAnalysis>() + .preserve<VirtRegMapAnalysis>() + .preserve<SlotIndexesAnalysis>() + .preserve<LiveIntervalsAnalysis>() + .preserve<LiveRegMatrixAnalysis>(); return PA; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 4b1f80c..a2e16c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -299,7 +299,7 @@ bool 
AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (Replacements.empty()) return false; - LLVMContext &Ctx = F.getParent()->getContext(); + LLVMContext &Ctx = F.getContext(); StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName()); FunctionType *NewFuncTy = FunctionType::get(NewRetTy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 2393346..963bb91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; -def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>; -def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>; @@ -409,7 +407,17 @@ def : AlwaysUniform<int_amdgcn_cluster_workgroup_max_flat_id>; def : AlwaysUniform<int_amdgcn_workgroup_id_x>; def : AlwaysUniform<int_amdgcn_workgroup_id_y>; def : AlwaysUniform<int_amdgcn_workgroup_id_z>; +def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>; def : AlwaysUniform<int_amdgcn_s_getpc>; def : AlwaysUniform<int_amdgcn_s_getreg>; def : AlwaysUniform<int_amdgcn_s_memrealtime>; def : AlwaysUniform<int_amdgcn_s_memtime>; + +def AMDGPUImageDMaskIntrinsicTable : GenericTable { + let FilterClass = "AMDGPUImageDMaskIntrinsic"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; + let PrimaryKeyEarlyOut = 1; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp index 2941a48..5b8ee5f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp @@ -7,13 +7,53 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSelectionDAGInfo.h" -#include "AMDGPUISelLowering.h" + +#define GET_SDNODE_DESC +#include "AMDGPUGenSDNodeInfo.inc" using namespace llvm; +AMDGPUSelectionDAGInfo::AMDGPUSelectionDAGInfo() + : SelectionDAGGenTargetInfo(AMDGPUGenSDNodeInfo) {} + AMDGPUSelectionDAGInfo::~AMDGPUSelectionDAGInfo() = default; -bool AMDGPUSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= AMDGPUISD::FIRST_MEMORY_OPCODE && - Opcode <= AMDGPUISD::LAST_MEMORY_OPCODE; +const char *AMDGPUSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { +#define NODE_NAME_CASE(node) \ + case AMDGPUISD::node: \ + return "AMDGPUISD::" #node; + + switch (static_cast<AMDGPUISD::NodeType>(Opcode)) { + // These nodes don't have corresponding entries in *.td files yet. + NODE_NAME_CASE(WAVE_ADDRESS) + NODE_NAME_CASE(MAD_I64_I32) + NODE_NAME_CASE(MAD_U64_U32) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + // These do, but only when compiling R600.td, + // and the enum is generated from AMDGPU.td. 
+ NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(R600_EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(DUMMY_CHAIN) + } + +#undef NODE_NAME_CASE + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); +} + +void AMDGPUSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + case AMDGPUISD::IF: + // result #0 must have type i1, but has type i32/i64 + case AMDGPUISD::ELSE: + case AMDGPUISD::LOOP: + // operand #1 must have type i1, but has type i32/i64 + case AMDGPUISD::LDS: + // result #0 must have type i64 (iPTR), but has type i32 + return; + } + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h index 3280be7..bae614a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h @@ -11,13 +11,49 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "AMDGPUGenSDNodeInfo.inc" + namespace llvm { +namespace AMDGPUISD { + +enum NodeType : unsigned { + // Convert a unswizzled wave uniform stack address to an address compatible + // with a vector offset for use in stack access. + WAVE_ADDRESS = GENERATED_OPCODE_END, + + DOT4, + MAD_U64_U32, + MAD_I64_I32, + TEXTURE_FETCH, + R600_EXPORT, + CONST_ADDRESS, -class AMDGPUSelectionDAGInfo : public SelectionDAGTargetInfo { + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. 
+ /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + + DUMMY_CHAIN, +}; + +} // namespace AMDGPUISD + +class AMDGPUSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + AMDGPUSelectionDAGInfo(); + ~AMDGPUSelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 9af8129..d04dc3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -314,9 +314,7 @@ public: #endif bool empty() const { return Nodes.empty(); } - const iterator_range<nodes_iterator> nodes() const { - return {Nodes.begin(), Nodes.end()}; - } + iterator_range<nodes_iterator> nodes() const { return Nodes; } const Node &getNode(unsigned ID) const { return *Nodes[ID]; } unsigned getNumNodes() const { return Nodes.size(); } @@ -993,7 +991,7 @@ void RecursiveSearchSplitting::run() { { SplitModuleTimer SMT("recursive_search_pick", "partitioning"); SplitProposal SP(SG, NumParts); - pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP); + pickPartition(/*BranchDepth=*/0, /*Idx=*/0, std::move(SP)); } } @@ -1140,7 +1138,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, LLVM_DEBUG(dbgs().indent(Depth) << " [lb] " << Idx << "=P" << CheapestPID << "? 
"); BranchSP.add(CheapestPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); + pickPartition(Depth + 1, Idx + 1, std::move(BranchSP)); } // ms = most similar = put in partition with the most in common @@ -1149,7 +1147,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, LLVM_DEBUG(dbgs().indent(Depth) << " [ms] " << Idx << "=P" << MostSimilarPID << "? "); BranchSP.add(MostSimilarPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); + pickPartition(Depth + 1, Idx + 1, std::move(BranchSP)); } return; @@ -1163,7 +1161,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" + std::to_string(NumProposalsSubmitted++)); LLVM_DEBUG(dbgs() << '\n'); - SubmitProposal(SP); + SubmitProposal(std::move(SP)); } std::pair<unsigned, CostType> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 26e0b3df..300aca1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -32,16 +32,6 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} - -bool AMDGPUSubtarget::useRealTrue16Insts() const { - return hasTrue16BitInsts() && EnableRealTrue16Insts; -} - -bool AMDGPUSubtarget::hasD16Writes32BitVgpr() const { - return EnableD16Writes32BitVgpr; -} - // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. 
@@ -282,7 +272,7 @@ bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const { } bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { - Function *Kernel = I->getParent()->getParent(); + Function *Kernel = I->getFunction(); unsigned MinSize = 0; unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; bool IdQuery = false; @@ -350,7 +340,7 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { - assert(AMDGPU::isKernel(F.getCallingConv())); + assert(AMDGPU::isKernel(F)); // We don't allocate the segment if we know the implicit arguments weren't // used, even if the ABI implies we need them. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index ed03ef2..302fe7c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -42,40 +42,18 @@ public: GFX10 = 9, GFX11 = 10, GFX12 = 11, + GFX13 = 12, }; private: Triple TargetTriple; protected: - bool GCN3Encoding = false; - bool Has16BitInsts = false; - bool HasTrue16BitInsts = false; - bool HasFP8ConversionScaleInsts = false; - bool HasBF8ConversionScaleInsts = false; - bool HasFP4ConversionScaleInsts = false; - bool HasFP6BF6ConversionScaleInsts = false; - bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; - bool HasCvtPkF16F32Inst = false; - bool HasF32ToF16BF16ConversionSRInsts = false; - bool EnableRealTrue16Insts = false; - bool EnableD16Writes32BitVgpr = false; - bool HasBF16TransInsts = false; - bool HasBF16ConversionInsts = false; - bool HasBF16PackedInsts = false; - bool HasMadMixInsts = false; - bool HasMadMacF32Insts = false; - bool HasDsSrc2Insts = false; - bool HasSDWA = false; - bool HasVOP3PInsts = false; bool HasMulI24 = true; bool HasMulU24 = true; bool HasSMulHi = false; - bool HasInv2PiInlineImm = false; bool HasFminFmaxLegacy = true; - bool EnablePromoteAlloca = false; - bool 
HasTrigReducedRange = false; - bool FastFMAF32 = false; + unsigned EUsPerCU = 4; unsigned MaxWavesPerEU = 10; unsigned LocalMemorySize = 0; @@ -83,7 +61,7 @@ protected: char WavefrontSizeLog2 = 0; public: - AMDGPUSubtarget(Triple TT); + AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} static const AMDGPUSubtarget &get(const MachineFunction &MF); static const AMDGPUSubtarget &get(const TargetMachine &TM, @@ -132,13 +110,6 @@ public: /// size, register usage, and/or lds usage. std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; - /// Overload which uses the specified values for the flat work group sizes, - /// rather than querying the function itself. \p FlatWorkGroupSizes Should - /// correspond to the function's value for getFlatWorkGroupSizes. - std::pair<unsigned, unsigned> - getWavesPerEU(const Function &F, - std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; - /// Overload which uses the specified values for the flat workgroup sizes and /// LDS space rather than querying the function itself. \p FlatWorkGroupSizes /// should correspond to the function's value for getFlatWorkGroupSizes and \p @@ -206,16 +177,13 @@ public: bool isGCN() const { return TargetTriple.isAMDGCN(); } - bool isGCN3Encoding() const { - return GCN3Encoding; - } - - bool has16BitInsts() const { - return Has16BitInsts; - } - - /// Return true if the subtarget supports True16 instructions. - bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } + //==---------------------------------------------------------------------===// + // TableGen-generated feature getters. 
+ //==---------------------------------------------------------------------===// +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + virtual bool GETTER() const { return false; } +#include "AMDGPUGenSubtargetInfo.inc" + //==---------------------------------------------------------------------===// /// Return true if real (non-fake) variants of True16 instructions using /// 16-bit registers should be code-generated. Fake True16 instructions are @@ -223,56 +191,8 @@ public: /// operands and always use their low halves. // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully // supported and the support for fake True16 instructions is removed. - bool useRealTrue16Insts() const; - - bool hasD16Writes32BitVgpr() const; - - bool hasBF16TransInsts() const { return HasBF16TransInsts; } - - bool hasBF16ConversionInsts() const { - return HasBF16ConversionInsts; - } - - bool hasBF16PackedInsts() const { return HasBF16PackedInsts; } - - bool hasMadMixInsts() const { - return HasMadMixInsts; - } - - bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; } - - bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; } - - bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; } - - bool hasFP6BF6ConversionScaleInsts() const { - return HasFP6BF6ConversionScaleInsts; - } - - bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { - return HasF16BF16ToFP6BF6ConversionScaleInsts; - } - - bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; } - - bool hasF32ToF16BF16ConversionSRInsts() const { - return HasF32ToF16BF16ConversionSRInsts; - } - - bool hasMadMacF32Insts() const { - return HasMadMacF32Insts || !isGCN(); - } - - bool hasDsSrc2Insts() const { - return HasDsSrc2Insts; - } - - bool hasSDWA() const { - return HasSDWA; - } - - bool hasVOP3PInsts() const { - return HasVOP3PInsts; + bool useRealTrue16Insts() const { + return hasTrue16BitInsts() && enableRealTrue16Insts(); } bool 
hasMulI24() const { @@ -287,26 +207,10 @@ public: return HasSMulHi; } - bool hasInv2PiInlineImm() const { - return HasInv2PiInlineImm; - } - bool hasFminFmaxLegacy() const { return HasFminFmaxLegacy; } - bool hasTrigReducedRange() const { - return HasTrigReducedRange; - } - - bool hasFastFMAF32() const { - return FastFMAF32; - } - - bool isPromoteAllocaEnabled() const { - return EnablePromoteAlloca; - } - unsigned getWavefrontSize() const { return 1 << WavefrontSizeLog2; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4a9437b..3fd554a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -47,8 +47,8 @@ // corresponds to offset, second member corresponds to size of LDS global // being replaced and third represents the total aligned size. It will // have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have -// an intializer with static LDS related offsets and sizes initialized. -// But for dynamic LDS related entries, offsets will be intialized to +// an initializer with static LDS related offsets and sizes initialized. +// But for dynamic LDS related entries, offsets will be initialized to // previous static LDS allocation end offset. Sizes for them will be zero // initially. 
These dynamic LDS offset and size values will be updated // within the kernel, since kernel can read the dynamic LDS size @@ -271,7 +271,7 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { Function *CalledFunc = CallerCGN->getFunction(); if (!CalledFunc || CalledFunc->isDeclaration()) continue; - if (AMDGPU::isKernelLDS(CalledFunc)) + if (AMDGPU::isKernel(*CalledFunc)) continue; for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end(); AI != E; ++AI) { @@ -297,7 +297,7 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (User *V : GV->users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); - if (!isKernelLDS(F) && !F->isDeclaration()) + if (!isKernel(*F) && !F->isDeclaration()) FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV); } } @@ -523,7 +523,7 @@ static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV, auto ReplaceUsesLambda = [Func](const Use &U) -> bool { auto *V = U.getUser(); if (auto *Inst = dyn_cast<Instruction>(V)) { - auto *Func1 = Inst->getParent()->getParent(); + auto *Func1 = Inst->getFunction(); if (Func == Func1) return true; } @@ -1169,7 +1169,7 @@ bool AMDGPUSwLowerLDS::run() { if (!F || K.second.empty()) continue; - assert(isKernelLDS(F)); + assert(isKernel(*F)); // Only inserts if key isn't already in the map. 
FuncLDSAccessInfo.KernelToLDSParametersMap.insert( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a20..49c60c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -17,9 +17,12 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" +#include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUBarrierLatency.h" #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" +#include "AMDGPUHazardLatency.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPULowerVGPREncoding.h" @@ -72,6 +75,7 @@ #include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/BranchRelaxation.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" +#include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" @@ -140,29 +144,36 @@ public: const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC); - void addIRPasses(AddIRPass &) const; - void addCodeGenPrepare(AddIRPass &) const; - void addPreISel(AddIRPass &addPass) const; - void addILPOpts(AddMachinePass &) const; - void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; - Error addInstSelector(AddMachinePass &) const; - void addPreRewrite(AddMachinePass &) const; - void addMachineSSAOptimization(AddMachinePass &) const; - void addPostRegAlloc(AddMachinePass &) const; - void addPreEmitPass(AddMachinePass &) const; - void addPreEmitRegAlloc(AddMachinePass &) const; - Error addRegAssignmentOptimized(AddMachinePass &) const; - void addPreRegAlloc(AddMachinePass &) const; - void addOptimizedRegAlloc(AddMachinePass &) const; - void addPreSched2(AddMachinePass &) const; + void addIRPasses(PassManagerWrapper &PMW) const; + void 
addCodeGenPrepare(PassManagerWrapper &PMW) const; + void addPreISel(PassManagerWrapper &PMW) const; + void addILPOpts(PassManagerWrapper &PMWM) const; + void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const; + Error addInstSelector(PassManagerWrapper &PMW) const; + void addPreRewrite(PassManagerWrapper &PMW) const; + void addMachineSSAOptimization(PassManagerWrapper &PMW) const; + void addPostRegAlloc(PassManagerWrapper &PMW) const; + void addPreEmitPass(PassManagerWrapper &PMWM) const; + void addPreEmitRegAlloc(PassManagerWrapper &PMW) const; + Error addRegAssignmentFast(PassManagerWrapper &PMW) const; + Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const; + void addPreRegAlloc(PassManagerWrapper &PMW) const; + Error addFastRegAlloc(PassManagerWrapper &PMW) const; + Error addOptimizedRegAlloc(PassManagerWrapper &PMW) const; + void addPreSched2(PassManagerWrapper &PMW) const; + void addPostBBSections(PassManagerWrapper &PMW) const; + +private: + Error validateRegAllocOptions() const; +public: /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used /// given that a pass shall work at an optimization \p Level minimum. 
bool isPassEnabled(const cl::opt<bool> &Opt, CodeGenOptLevel Level = CodeGenOptLevel::Default) const; - void addEarlyCSEOrGVNPass(AddIRPass &) const; - void addStraightLineScalarOptimizationPasses(AddIRPass &) const; + void addEarlyCSEOrGVNPass(PassManagerWrapper &PMW) const; + void addStraightLineScalarOptimizationPasses(PassManagerWrapper &PMW) const; }; class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { @@ -237,6 +248,63 @@ static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false, cl::init(&useDefaultRegisterAllocator), cl::desc("Register allocator to use for WWM registers")); +// New pass manager register allocator options for AMDGPU +static cl::opt<RegAllocType, false, RegAllocTypeParser> SGPRRegAllocNPM( + "sgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default), + cl::desc("Register allocator for SGPRs (new pass manager)")); + +static cl::opt<RegAllocType, false, RegAllocTypeParser> VGPRRegAllocNPM( + "vgpr-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default), + cl::desc("Register allocator for VGPRs (new pass manager)")); + +static cl::opt<RegAllocType, false, RegAllocTypeParser> WWMRegAllocNPM( + "wwm-regalloc-npm", cl::Hidden, cl::init(RegAllocType::Default), + cl::desc("Register allocator for WWM registers (new pass manager)")); + +/// Check if the given RegAllocType is supported for AMDGPU NPM register +/// allocation. Only Fast and Greedy are supported; Basic and PBQP are not. +static Error checkRegAllocSupported(RegAllocType RAType, StringRef RegName) { + if (RAType == RegAllocType::Basic || RAType == RegAllocType::PBQP) { + return make_error<StringError>( + Twine("unsupported register allocator '") + + (RAType == RegAllocType::Basic ? "basic" : "pbqp") + "' for " + + RegName + " registers", + inconvertibleErrorCode()); + } + return Error::success(); +} + +Error AMDGPUCodeGenPassBuilder::validateRegAllocOptions() const { + // 1. Generic --regalloc-npm is not supported for AMDGPU. 
+ if (Opt.RegAlloc != RegAllocType::Unset) { + return make_error<StringError>( + "-regalloc-npm not supported for amdgcn. Use -sgpr-regalloc-npm, " + "-vgpr-regalloc-npm, and -wwm-regalloc-npm", + inconvertibleErrorCode()); + } + + // 2. Legacy PM regalloc options are not compatible with NPM. + if (SGPRRegAlloc.getNumOccurrences() > 0 || + VGPRRegAlloc.getNumOccurrences() > 0 || + WWMRegAlloc.getNumOccurrences() > 0) { + return make_error<StringError>( + "-sgpr-regalloc, -vgpr-regalloc, and -wwm-regalloc are legacy PM " + "options. Use -sgpr-regalloc-npm, -vgpr-regalloc-npm, and " + "-wwm-regalloc-npm with the new pass manager", + inconvertibleErrorCode()); + } + + // 3. Only Fast and Greedy allocators are supported for AMDGPU. + if (auto Err = checkRegAllocSupported(SGPRRegAllocNPM, "SGPR")) + return Err; + if (auto Err = checkRegAllocSupported(WWMRegAllocNPM, "WWM")) + return Err; + if (auto Err = checkRegAllocSupported(VGPRRegAllocNPM, "VGPR")) + return Err; + + return Error::success(); +} + static void initializeDefaultSGPRRegisterAllocatorOnce() { RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); @@ -464,6 +532,11 @@ static cl::opt<bool> EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLowerExecSync( + "amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of execution synchronization."), cl::init(true), + cl::Hidden); + static cl::opt<bool> EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -566,9 +639,10 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); + initializeAMDGPULowerExecSyncLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); - initializeAMDGPUArgumentUsageInfoPass(*PR); + 
initializeAMDGPUArgumentUsageInfoWrapperLegacyPass(*PR); initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); @@ -618,6 +692,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -639,6 +714,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); + DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF)); return DAG; } @@ -659,6 +736,8 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); + DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF)); return DAG; } @@ -737,7 +816,7 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { return "r600"; } -static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { +static Reloc::Model getEffectiveRelocModel() { // The AMDGPU toolchain only supports generating shared objects, so we // must always use PIC. 
return Reloc::PIC_; @@ -751,8 +830,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOptLevel OptLevel) : CodeGenTargetMachineImpl( T, TT.computeDataLayout(), TT, getGPUOrDefault(TT, CPU), FS, Options, - getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), + getEffectiveRelocModel(), getEffectiveCodeModel(CM, CodeModel::Small), + OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); if (TT.isAMDGCN()) { @@ -802,7 +881,8 @@ static bool mustPreserveGV(const GlobalValue &GV) { } void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { - AAM.registerFunctionAnalysis<AMDGPUAA>(); + if (EnableAMDGPUAliasAnalysis) + AAM.registerFunctionAnalysis<AMDGPUAA>(); } static Expected<ScanOptions> @@ -812,7 +892,7 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { Params.consume_front("strategy="); auto Result = StringSwitch<std::optional<ScanOptions>>(Params) .Case("dpp", ScanOptions::DPP) - .Cases("iterative", "", ScanOptions::Iterative) + .Cases({"iterative", ""}, ScanOptions::Iterative) .Case("none", ScanOptions::None) .Default(std::nullopt); if (Result) @@ -884,9 +964,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -897,6 +974,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( @@ -958,6 +1038,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". 
// For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. + if (EnableLowerExecSync) + PM.addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1197,6 +1279,8 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const { EnableVOPD) DAG->addMutation(createVOPDPairingMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); + DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF)); + DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF)); return DAG; } //===----------------------------------------------------------------------===// @@ -1213,10 +1297,6 @@ class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { - // It is necessary to know the register usage of the entire call graph. We - // allow calls without EnableAMDGPUFunctionCalls if they are marked - // noinline, so this is always required. - setRequiresCodeGenSCCOrder(true); substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -1310,6 +1390,9 @@ void AMDGPUPassConfig::addIRPasses() { isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + if (EnableUniformIntrinsicCombine) + addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); + // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); @@ -1325,6 +1408,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerExecSync) + addPass(createAMDGPULowerExecSyncLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. 
if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -1410,9 +1497,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // many cases. addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); - // In accordance with the above FIXME, manually force all the - // function-level passes into a CGSCCPassManager. - addPass(new DummyCGSCCPass()); } // LowerSwitch pass may introduce unreachable blocks that can @@ -2007,6 +2091,42 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->ArgInfo.WorkItemIDZ, 0, 0))) return true; + // Parse FirstKernArgPreloadReg separately, since it's a Register, + // not ArgDescriptor. + if (YamlMFI.ArgInfo && YamlMFI.ArgInfo->FirstKernArgPreloadReg) { + const yaml::SIArgument &A = *YamlMFI.ArgInfo->FirstKernArgPreloadReg; + + if (!A.IsRegister) { + // For stack arguments, we don't have RegisterName.SourceRange, + // but we should have some location info from the YAML parser + const MemoryBuffer &Buffer = + *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); + // Create a minimal valid source range + SMLoc Loc = SMLoc::getFromPointer(Buffer.getBufferStart()); + SMRange Range(Loc, Loc); + + Error = SMDiagnostic( + *PFS.SM, Loc, Buffer.getBufferIdentifier(), 1, 0, SourceMgr::DK_Error, + "firstKernArgPreloadReg must be a register, not a stack location", "", + {}, {}); + + SourceRange = Range; + return true; + } + + Register Reg; + if (parseNamedRegisterReference(PFS, Reg, A.RegisterName.Value, Error)) { + SourceRange = A.RegisterName.SourceRange; + return true; + } + + if (!AMDGPU::SGPR_32RegClass.contains(Reg)) + return diagnoseRegisterClass(A.RegisterName); + + MFI->ArgInfo.FirstKernArgPreloadReg = Reg; + MFI->NumUserSGPRs += YamlMFI.NumKernargPreloadSGPRs; + } + if (ST.hasIEEEMode()) MFI->Mode.IEEE = YamlMFI.Mode.IEEE; if (ST.hasDX10ClampMode()) @@ -2046,63 +2166,74 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( // Exceptions and StackMaps are not supported, so these passes 
will never do // anything. // Garbage collection is not supported. - disablePass<StackMapLivenessPass, FuncletLayoutPass, - ShadowStackGCLoweringPass>(); + disablePass<StackMapLivenessPass, FuncletLayoutPass, PatchableFunctionPass, + ShadowStackGCLoweringPass, GCLoweringPass>(); } -void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { - if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) - addPass(AMDGPURemoveIncompatibleFunctionsPass(TM)); +void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const { + if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN()) { + flushFPMsToMPM(PMW); + addModulePass(AMDGPURemoveIncompatibleFunctionsPass(TM), PMW); + } - addPass(AMDGPUPrintfRuntimeBindingPass()); + flushFPMsToMPM(PMW); + addModulePass(AMDGPUPrintfRuntimeBindingPass(), PMW); if (LowerCtorDtor) - addPass(AMDGPUCtorDtorLoweringPass()); + addModulePass(AMDGPUCtorDtorLoweringPass(), PMW); if (isPassEnabled(EnableImageIntrinsicOptimizer)) - addPass(AMDGPUImageIntrinsicOptimizerPass(TM)); + addFunctionPass(AMDGPUImageIntrinsicOptimizerPass(TM), PMW); + if (EnableUniformIntrinsicCombine) + addFunctionPass(AMDGPUUniformIntrinsicCombinePass(), PMW); // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. 
- addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering)); + flushFPMsToMPM(PMW); + addModulePass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering), PMW); - addPass(AMDGPUAlwaysInlinePass()); - addPass(AlwaysInlinerPass()); + addModulePass(AMDGPUAlwaysInlinePass(), PMW); + addModulePass(AlwaysInlinerPass(), PMW); - addPass(AMDGPUExportKernelRuntimeHandlesPass()); + addModulePass(AMDGPUExportKernelRuntimeHandlesPass(), PMW); + + if (EnableLowerExecSync) + addModulePass(AMDGPULowerExecSyncPass(), PMW); if (EnableSwLowerLDS) - addPass(AMDGPUSwLowerLDSPass(TM)); + addModulePass(AMDGPUSwLowerLDSPass(TM), PMW); // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) - addPass(AMDGPULowerModuleLDSPass(TM)); + addModulePass(AMDGPULowerModuleLDSPass(TM), PMW); // Run atomic optimizer before Atomic Expand if (TM.getOptLevel() >= CodeGenOptLevel::Less && (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) - addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy)); + addFunctionPass( + AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy), PMW); - addPass(AtomicExpandPass(&TM)); + addFunctionPass(AtomicExpandPass(TM), PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) { - addPass(AMDGPUPromoteAllocaPass(TM)); + addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW); if (isPassEnabled(EnableScalarIRPasses)) - addStraightLineScalarOptimizationPasses(addPass); + addStraightLineScalarOptimizationPasses(PMW); // TODO: Handle EnableAMDGPUAliasAnalysis // TODO: May want to move later or split into an early and late one. - addPass(AMDGPUCodeGenPreparePass(TM)); + addFunctionPass(AMDGPUCodeGenPreparePass(TM), PMW); // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may // have expanded. 
if (TM.getOptLevel() > CodeGenOptLevel::Less) { - addPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), - /*UseMemorySSA=*/true)); + addFunctionPass(createFunctionToLoopPassAdaptor(LICMPass(LICMOptions()), + /*UseMemorySSA=*/true), + PMW); } } - Base::addIRPasses(addPass); + Base::addIRPasses(PMW); // EarlyCSE is not always strong enough to clean up what LSR produces. For // example, GVN can combine @@ -2117,20 +2248,23 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { // // but EarlyCSE can do neither of them. if (isPassEnabled(EnableScalarIRPasses)) - addEarlyCSEOrGVNPass(addPass); + addEarlyCSEOrGVNPass(PMW); } -void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { - if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(AMDGPUPreloadKernelArgumentsPass(TM)); +void AMDGPUCodeGenPassBuilder::addCodeGenPrepare( + PassManagerWrapper &PMW) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) { + flushFPMsToMPM(PMW); + addModulePass(AMDGPUPreloadKernelArgumentsPass(TM), PMW); + } if (EnableLowerKernelArguments) - addPass(AMDGPULowerKernelArgumentsPass(TM)); + addFunctionPass(AMDGPULowerKernelArgumentsPass(TM), PMW); - Base::addCodeGenPrepare(addPass); + Base::addCodeGenPrepare(PMW); if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); + addFunctionPass(LoadStoreVectorizerPass(), PMW); // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -2139,102 +2273,160 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. 
- addPass(AMDGPULowerBufferFatPointersPass(TM)); - addPass.requireCGSCCOrder(); + flushFPMsToMPM(PMW); + addModulePass(AMDGPULowerBufferFatPointersPass(TM), PMW); + flushFPMsToMPM(PMW); + requireCGSCCOrder(PMW); - addPass(AMDGPULowerIntrinsicsPass(TM)); + addModulePass(AMDGPULowerIntrinsicsPass(TM), PMW); // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the // pass flow. - addPass(LowerSwitchPass()); + addFunctionPass(LowerSwitchPass(), PMW); } -void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const { + + // Require AMDGPUArgumentUsageAnalysis so that it's available during ISel. + flushFPMsToMPM(PMW); + addModulePass(RequireAnalysisPass<AMDGPUArgumentUsageAnalysis, Module>(), + PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) { - addPass(FlattenCFGPass()); - addPass(SinkingPass()); - addPass(AMDGPULateCodeGenPreparePass(TM)); + addFunctionPass(FlattenCFGPass(), PMW); + addFunctionPass(SinkingPass(), PMW); + addFunctionPass(AMDGPULateCodeGenPreparePass(TM), PMW); } // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
- addPass(AMDGPUUnifyDivergentExitNodesPass()); - addPass(FixIrreduciblePass()); - addPass(UnifyLoopExitsPass()); - addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false)); + addFunctionPass(AMDGPUUnifyDivergentExitNodesPass(), PMW); + addFunctionPass(FixIrreduciblePass(), PMW); + addFunctionPass(UnifyLoopExitsPass(), PMW); + addFunctionPass(StructurizeCFGPass(/*SkipUniformRegions=*/false), PMW); - addPass(AMDGPUAnnotateUniformValuesPass()); + addFunctionPass(AMDGPUAnnotateUniformValuesPass(), PMW); - addPass(SIAnnotateControlFlowPass(TM)); + addFunctionPass(SIAnnotateControlFlowPass(TM), PMW); // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. This depends on stopping SIAnnotateControlFlow from making // control flow modifications. - addPass(AMDGPURewriteUndefForPHIPass()); + addFunctionPass(AMDGPURewriteUndefForPHIPass(), PMW); if (!getCGPassBuilderOption().EnableGlobalISelOption || !isGlobalISelAbortEnabled() || !NewRegBankSelect) - addPass(LCSSAPass()); + addFunctionPass(LCSSAPass(), PMW); - if (TM.getOptLevel() > CodeGenOptLevel::Less) - addPass(AMDGPUPerfHintAnalysisPass(TM)); + if (TM.getOptLevel() > CodeGenOptLevel::Less) { + flushFPMsToMPM(PMW); + addModulePass(AMDGPUPerfHintAnalysisPass(TM), PMW); + } // FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why // isn't this in addInstSelector? 
- addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), - /*Force=*/true); + addFunctionPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>(), PMW, + /*Force=*/true); } -void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addILPOpts(PassManagerWrapper &PMW) const { if (EnableEarlyIfConversion) - addPass(EarlyIfConverterPass()); + addMachineFunctionPass(EarlyIfConverterPass(), PMW); - Base::addILPOpts(addPass); + Base::addILPOpts(PMW); } -void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass, +void AMDGPUCodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const { // TODO: Add AsmPrinter. } -Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const { - addPass(AMDGPUISelDAGToDAGPass(TM)); - addPass(SIFixSGPRCopiesPass()); - addPass(SILowerI1CopiesPass()); +Error AMDGPUCodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const { + addMachineFunctionPass(AMDGPUISelDAGToDAGPass(TM), PMW); + addMachineFunctionPass(SIFixSGPRCopiesPass(), PMW); + addMachineFunctionPass(SILowerI1CopiesPass(), PMW); return Error::success(); } -void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreRewrite(PassManagerWrapper &PMW) const { if (EnableRegReassign) { - addPass(GCNNSAReassignPass()); + addMachineFunctionPass(GCNNSAReassignPass(), PMW); } + + addMachineFunctionPass(AMDGPURewriteAGPRCopyMFMAPass(), PMW); } void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( - AddMachinePass &addPass) const { - Base::addMachineSSAOptimization(addPass); + PassManagerWrapper &PMW) const { + Base::addMachineSSAOptimization(PMW); - addPass(SIFoldOperandsPass()); + addMachineFunctionPass(SIFoldOperandsPass(), PMW); if (EnableDPPCombine) { - addPass(GCNDPPCombinePass()); + addMachineFunctionPass(GCNDPPCombinePass(), PMW); } - addPass(SILoadStoreOptimizerPass()); + 
addMachineFunctionPass(SILoadStoreOptimizerPass(), PMW); if (isPassEnabled(EnableSDWAPeephole)) { - addPass(SIPeepholeSDWAPass()); - addPass(EarlyMachineLICMPass()); - addPass(MachineCSEPass()); - addPass(SIFoldOperandsPass()); + addMachineFunctionPass(SIPeepholeSDWAPass(), PMW); + addMachineFunctionPass(EarlyMachineLICMPass(), PMW); + addMachineFunctionPass(MachineCSEPass(), PMW); + addMachineFunctionPass(SIFoldOperandsPass(), PMW); } - addPass(DeadMachineInstructionElimPass()); - addPass(SIShrinkInstructionsPass()); + addMachineFunctionPass(DeadMachineInstructionElimPass(), PMW); + addMachineFunctionPass(SIShrinkInstructionsPass(), PMW); +} + +Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const { + insertPass<PHIEliminationPass>(SILowerControlFlowPass()); + + insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass()); + + return Base::addFastRegAlloc(PMW); } -void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( - AddMachinePass &addPass) const { +Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast( + PassManagerWrapper &PMW) const { + if (auto Err = validateRegAllocOptions()) + return Err; + + addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW); + + // SGPR allocation - default to fast at -O0. + if (SGPRRegAllocNPM == RegAllocType::Greedy) + addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW); + else + addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}), + PMW); + + // Equivalent of PEI for SGPRs. + addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW); + + // To Allocate wwm registers used in whole quad mode operations (for shaders). + addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW); + + // WWM allocation - default to fast at -O0. 
+ if (WWMRegAllocNPM == RegAllocType::Greedy) + addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW); + else + addMachineFunctionPass( + RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW); + + addMachineFunctionPass(SILowerWWMCopiesPass(), PMW); + addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW); + + // VGPR allocation - default to fast at -O0. + if (VGPRRegAllocNPM == RegAllocType::Greedy) + addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW); + else + addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW); + + return Error::success(); +} + +Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( + PassManagerWrapper &PMW) const { if (EnableDCEInRA) insertPass<DetectDeadLanesPass>(DeadMachineInstructionElimPass()); @@ -2269,90 +2461,108 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( if (TM.getOptLevel() > CodeGenOptLevel::Less) insertPass<MachineSchedulerPass>(SIFormMemoryClausesPass()); - Base::addOptimizedRegAlloc(addPass); + return Base::addOptimizedRegAlloc(PMW); } -void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(PassManagerWrapper &PMW) const { if (getOptLevel() != CodeGenOptLevel::None) - addPass(AMDGPUPrepareAGPRAllocPass()); + addMachineFunctionPass(AMDGPUPrepareAGPRAllocPass(), PMW); } Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( - AddMachinePass &addPass) const { - // TODO: Check --regalloc-npm option + PassManagerWrapper &PMW) const { + if (auto Err = validateRegAllocOptions()) + return Err; - addPass(GCNPreRALongBranchRegPass()); + addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW); - addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"})); + // SGPR allocation - default to greedy at -O1 and above. 
+ if (SGPRRegAllocNPM == RegAllocType::Fast) + addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}), + PMW); + else + addMachineFunctionPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}), PMW); // Commit allocated register changes. This is mostly necessary because too // many things rely on the use lists of the physical registers, such as the // verifier. This is only necessary with allocators which use LiveIntervals, // since FastRegAlloc does the replacements itself. - addPass(VirtRegRewriterPass(false)); + addMachineFunctionPass(VirtRegRewriterPass(false), PMW); // At this point, the sgpr-regalloc has been done and it is good to have the // stack slot coloring to try to optimize the SGPR spill stack indices before // attempting the custom SGPR spill lowering. - addPass(StackSlotColoringPass()); + addMachineFunctionPass(StackSlotColoringPass(), PMW); // Equivalent of PEI for SGPRs. - addPass(SILowerSGPRSpillsPass()); + addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW); // To Allocate wwm registers used in whole quad mode operations (for shaders). - addPass(SIPreAllocateWWMRegsPass()); - - // For allocating other wwm register operands. - addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"})); - addPass(SILowerWWMCopiesPass()); - addPass(VirtRegRewriterPass(false)); - addPass(AMDGPUReserveWWMRegsPass()); - - // For allocating per-thread VGPRs. - addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"})); + addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW); + // WWM allocation - default to greedy at -O1 and above. 
+ if (WWMRegAllocNPM == RegAllocType::Fast) + addMachineFunctionPass( + RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), PMW); + else + addMachineFunctionPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}), PMW); + addMachineFunctionPass(SILowerWWMCopiesPass(), PMW); + addMachineFunctionPass(VirtRegRewriterPass(false), PMW); + addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW); + + // VGPR allocation - default to greedy at -O1 and above. + if (VGPRRegAllocNPM == RegAllocType::Fast) + addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW); + else + addMachineFunctionPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}), PMW); - addPreRewrite(addPass); - addPass(VirtRegRewriterPass(true)); + addPreRewrite(PMW); + addMachineFunctionPass(VirtRegRewriterPass(true), PMW); - addPass(AMDGPUMarkLastScratchLoadPass()); + addMachineFunctionPass(AMDGPUMarkLastScratchLoadPass(), PMW); return Error::success(); } -void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { - addPass(SIFixVGPRCopiesPass()); +void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const { + addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(SIOptimizeExecMaskingPass()); - Base::addPostRegAlloc(addPass); + addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW); + Base::addPostRegAlloc(PMW); } -void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPreSched2(PassManagerWrapper &PMW) const { if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(SIShrinkInstructionsPass()); - addPass(SIPostRABundlerPass()); + addMachineFunctionPass(SIShrinkInstructionsPass(), PMW); + addMachineFunctionPass(SIPostRABundlerPass(), PMW); } -void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { +void AMDGPUCodeGenPassBuilder::addPostBBSections( + PassManagerWrapper &PMW) const { + // We run this later to avoid passes like 
livedebugvalues and BBSections + // having to deal with the apparent multi-entry functions we may generate. + addMachineFunctionPass(AMDGPUPreloadKernArgPrologPass(), PMW); +} + +void AMDGPUCodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { - addPass(GCNCreateVOPDPass()); + addMachineFunctionPass(GCNCreateVOPDPass(), PMW); } - addPass(SIMemoryLegalizerPass()); - addPass(SIInsertWaitcntsPass()); + addMachineFunctionPass(SIMemoryLegalizerPass(), PMW); + addMachineFunctionPass(SIInsertWaitcntsPass(), PMW); - // TODO: addPass(SIModeRegisterPass()); + addMachineFunctionPass(SIModeRegisterPass(), PMW); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - // TODO: addPass(SIInsertHardClausesPass()); - } + if (TM.getOptLevel() > CodeGenOptLevel::None) + addMachineFunctionPass(SIInsertHardClausesPass(), PMW); - addPass(SILateBranchLoweringPass()); + addMachineFunctionPass(SILateBranchLoweringPass(), PMW); if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) - addPass(AMDGPUSetWavePriorityPass()); + addMachineFunctionPass(AMDGPUSetWavePriorityPass(), PMW); if (TM.getOptLevel() > CodeGenOptLevel::None) - addPass(SIPreEmitPeepholePass()); + addMachineFunctionPass(SIPreEmitPeepholePass(), PMW); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there @@ -2362,15 +2572,15 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { // // Here we add a stand-alone hazard recognizer pass which can handle all // cases. 
- addPass(PostRAHazardRecognizerPass()); - addPass(AMDGPUWaitSGPRHazardsPass()); - addPass(AMDGPULowerVGPREncodingPass()); + addMachineFunctionPass(PostRAHazardRecognizerPass(), PMW); + addMachineFunctionPass(AMDGPUWaitSGPRHazardsPass(), PMW); + addMachineFunctionPass(AMDGPULowerVGPREncodingPass(), PMW); if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) { - addPass(AMDGPUInsertDelayAluPass()); + addMachineFunctionPass(AMDGPUInsertDelayAluPass(), PMW); } - addPass(BranchRelaxationPass()); + addMachineFunctionPass(BranchRelaxationPass(), PMW); } bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt, @@ -2382,32 +2592,33 @@ bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt, return Opt; } -void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const { +void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass( + PassManagerWrapper &PMW) const { if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) - addPass(GVNPass()); + addFunctionPass(GVNPass(), PMW); else - addPass(EarlyCSEPass()); + addFunctionPass(EarlyCSEPass(), PMW); } void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses( - AddIRPass &addPass) const { + PassManagerWrapper &PMW) const { if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive)) - addPass(LoopDataPrefetchPass()); + addFunctionPass(LoopDataPrefetchPass(), PMW); - addPass(SeparateConstOffsetFromGEPPass()); + addFunctionPass(SeparateConstOffsetFromGEPPass(), PMW); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. - addPass(StraightLineStrengthReducePass()); + addFunctionPass(StraightLineStrengthReducePass(), PMW); // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or // EarlyCSE can reuse. - addEarlyCSEOrGVNPass(addPass); + addEarlyCSEOrGVNPass(PMW); // Run NaryReassociate after EarlyCSE/GVN to be more effective. 
- addPass(NaryReassociatePass()); + addFunctionPass(NaryReassociatePass(), PMW); // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. - addPass(EarlyCSEPass()); + addFunctionPass(EarlyCSEPass(), PMW); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 03d16fd..d4a6838 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB( static cl::opt<unsigned> MemcpyLoopUnroll( "amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " - "operations when lowering memcpy as a loop"), + "operations when lowering statically-sized memcpy, memmove, or" + "memset as a loop"), cl::init(16), cl::Hidden); static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, @@ -206,9 +207,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences( dyn_cast<AllocaInst>(getUnderlyingObject(Ptr)); if (!Alloca || !Alloca->isStaticAlloca()) continue; - Type *Ty = Alloca->getAllocatedType(); - unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; - if (AllocaSize > MaxAlloca) + auto AllocaSize = Alloca->getAllocationSize(DL); + if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca) continue; } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { @@ -285,7 +285,7 @@ uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { // Codegen control options which don't matter. 
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, - AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal, + AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal, AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode, @@ -300,7 +300,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { AMDGPU::FeatureSRAMECC, // Perf-tuning features - AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops}; + AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops}; GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getDataLayout()), @@ -804,7 +804,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, InstRate = getFullRateInstrCost(); static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16}; - if (any_of(ValidSatTys, [<](MVT M) { return M == LT.second; })) + if (any_of(ValidSatTys, equal_to(LT.second))) NElts = 1; break; } @@ -883,10 +883,9 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, return LT.first * getHalfRateInstrCost(CostKind); } -InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, const Value *Op0, - const Value *Op1) const { +InstructionCost GCNTTIImpl::getVectorInstrCost( + unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, + const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { @@ -895,8 +894,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, if (EltSize < 32) { if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) return 0; - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, - Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } // Extracts are just reads of a subregister, so are 
free. Inserts are @@ -907,7 +906,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return Index == ~0u ? 2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } } @@ -1150,41 +1150,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); return NewVal; } - case Intrinsic::ptrmask: { - unsigned OldAS = OldV->getType()->getPointerAddressSpace(); - unsigned NewAS = NewV->getType()->getPointerAddressSpace(); - Value *MaskOp = II->getArgOperand(1); - Type *MaskTy = MaskOp->getType(); - - bool DoTruncate = false; - - const GCNTargetMachine &TM = - static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine()); - if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) { - // All valid 64-bit to 32-bit casts work by chopping off the high - // bits. Any masking only clearing the low bits will also apply in the new - // address space. - if (DL.getPointerSizeInBits(OldAS) != 64 || - DL.getPointerSizeInBits(NewAS) != 32) - return nullptr; - - // TODO: Do we need to thread more context in here? - KnownBits Known = computeKnownBits(MaskOp, DL, nullptr, II); - if (Known.countMinLeadingOnes() < 32) - return nullptr; - - DoTruncate = true; - } - - IRBuilder<> B(II); - if (DoTruncate) { - MaskTy = B.getInt32Ty(); - MaskOp = B.CreateTrunc(MaskOp, MaskTy); - } - - return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy}, - {NewV, MaskOp}); - } case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: { Type *DestTy = II->getType(); @@ -1241,46 +1206,123 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, (ScalarSize == 16 || ScalarSize == 8)) { // Larger vector widths may require additional instructions, but are // typically cheaper than scalarized versions. 
- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements(); - unsigned RequestedElts = - count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); + // + // We assume that shuffling at a register granularity can be done for free. + // This is not true for vectors fed into memory instructions, but it is + // effectively true for all other shuffling. The emphasis of the logic here + // is to assist generic transform in cleaning up / canonicalizing those + // shuffles. + + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle of two elements is free. + if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) { + unsigned NumSrcElts = SrcVecTy->getNumElements(); + if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 && + (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse || + Kind == TTI::SK_PermuteSingleSrc)) + return 0; + } + unsigned EltsPerReg = 32 / ScalarSize; - if (RequestedElts == 0) - return 0; switch (Kind) { case TTI::SK_Broadcast: + // A single v_perm_b32 can be re-used for all destination registers. + return 1; case TTI::SK_Reverse: - case TTI::SK_PermuteSingleSrc: { - // With op_sel VOP3P instructions freely can access the low half or high - // half of a register, so any swizzle of two elements is free. - if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2) - return 0; - unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; - // SK_Broadcast just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; - return NumPerms + NumPermMasks; - } + // One instruction per register. 
+ if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy)) + return divideCeil(DstVecTy->getNumElements(), EltsPerReg); + return InstructionCost::getInvalid(); case TTI::SK_ExtractSubvector: + if (Index % EltsPerReg == 0) + return 0; // Shuffling at register granularity + if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy)) + return divideCeil(DstVecTy->getNumElements(), EltsPerReg); + return InstructionCost::getInvalid(); case TTI::SK_InsertSubvector: { - // Even aligned accesses are free - if (!(Index % 2)) - return 0; - // Insert/extract subvectors only require shifts / extract code to get the - // relevant bits - return alignTo(RequestedElts, EltsPerReg) / EltsPerReg; + auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy); + if (!DstVecTy) + return InstructionCost::getInvalid(); + unsigned NumDstElts = DstVecTy->getNumElements(); + unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements(); + unsigned EndIndex = Index + NumInsertElts; + unsigned BeginSubIdx = Index % EltsPerReg; + unsigned EndSubIdx = EndIndex % EltsPerReg; + unsigned Cost = 0; + + if (BeginSubIdx != 0) { + // Need to shift the inserted vector into place. The cost is the number + // of destination registers overlapped by the inserted vector. + Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg); + } + + // If the last register overlap is partial, there may be three source + // registers feeding into it; that takes an extra instruction. + if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx) + Cost += 1; + + return Cost; } - case TTI::SK_PermuteTwoSrc: - case TTI::SK_Splice: - case TTI::SK_Select: { - unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; - // SK_Select just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Select ? 
1 : NumPerms; - return NumPerms + NumPermMasks; + case TTI::SK_Splice: { + auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy); + if (!DstVecTy) + return InstructionCost::getInvalid(); + unsigned NumElts = DstVecTy->getNumElements(); + assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements()); + // Determine the sub-region of the result vector that requires + // sub-register shuffles / mixing. + unsigned EltsFromLHS = NumElts - Index; + bool LHSIsAligned = (Index % EltsPerReg) == 0; + bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0; + if (LHSIsAligned && RHSIsAligned) + return 0; + if (LHSIsAligned && !RHSIsAligned) + return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg); + if (!LHSIsAligned && RHSIsAligned) + return divideCeil(EltsFromLHS, EltsPerReg); + return divideCeil(NumElts, EltsPerReg); } - default: break; } + + if (!Mask.empty()) { + unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements(); + + // Generically estimate the cost by assuming that each destination + // register is derived from sources via v_perm_b32 instructions if it + // can't be copied as-is. + // + // For each destination register, derive the cost of obtaining it based + // on the number of source registers that feed into it. 
+ unsigned Cost = 0; + for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) { + SmallVector<int, 4> Regs; + bool Aligned = true; + for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) { + int SrcIdx = Mask[DstIdx + I]; + if (SrcIdx == -1) + continue; + int Reg; + if (SrcIdx < (int)NumSrcElts) { + Reg = SrcIdx / EltsPerReg; + if (SrcIdx % EltsPerReg != I) + Aligned = false; + } else { + Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg; + if ((SrcIdx - NumSrcElts) % EltsPerReg != I) + Aligned = false; + } + if (!llvm::is_contained(Regs, Reg)) + Regs.push_back(Reg); + } + if (Regs.size() >= 2) + Cost += Regs.size() - 1; + else if (!Aligned) + Cost += 1; + } + return Cost; + } } return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, @@ -1299,8 +1341,60 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I, if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) continue; - if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) + if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) { Ops.push_back(&Op); + continue; + } + + // Check for zero-cost multiple use InsertElement/ExtractElement + // instructions + if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) { + if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) { + Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0)); + if (VecOpInst && VecOpInst->hasOneUse()) + continue; + + if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(), + TTI::TCK_RecipThroughput, 0, + OpInst->getOperand(0), + OpInst->getOperand(1)) == 0) { + Ops.push_back(&Op); + continue; + } + } + } + + if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) { + + unsigned EltSize = DL.getTypeSizeInBits( + cast<VectorType>(Shuffle->getType())->getElementType()); + + // For i32 (or greater) shufflevectors, these will be lowered into a + // series of insert / extract elements, which will be coalesced away. 
+ if (EltSize < 16 || !ST->has16BitInsts()) + continue; + + int NumSubElts, SubIndex; + if (Shuffle->changesLength()) { + if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) { + Ops.push_back(&Op); + continue; + } + + if ((Shuffle->isExtractSubvectorMask(SubIndex) || + Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) && + !(SubIndex & 0x1)) { + Ops.push_back(&Op); + continue; + } + } + + if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() || + Shuffle->isSingleSource()) { + Ops.push_back(&Op); + continue; + } + } } return !Ops.empty(); @@ -1413,7 +1507,8 @@ static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second) continue; - AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); + if (auto Size = AI->getAllocationSize(DL)) + AllocaSize += Size->getFixedValue(); } return AllocaSize; } @@ -1467,10 +1562,13 @@ unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB, Threshold += Threshold / 2; } - auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); + auto ArgAllocaSize = AI->getAllocationSize(DL); + if (!ArgAllocaSize) + return 0; // Attribute the bonus proportionally to the alloca size - unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize; + unsigned AllocaThresholdBonus = + (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize; return AllocaThresholdBonus; } @@ -1574,3 +1672,14 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { } return BaseT::getNumberOfParts(Tp); } + +InstructionUniformity +GCNTTIImpl::getInstructionUniformity(const Value *V) const { + if (isAlwaysUniform(V)) + return InstructionUniformity::AlwaysUniform; + + if (isSourceOfDivergence(V)) + return InstructionUniformity::NeverUniform; + + return InstructionUniformity::Default; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 20da834..3ec157aa 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -101,6 +101,14 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const; + /// \returns true if V might be divergent even when all of its operands + /// are uniform. + bool isSourceOfDivergence(const Value *V) const; + + /// Returns true for the target specific set of operations which produce + /// uniform result even taking non-uniform arguments. + bool isAlwaysUniform(const Value *V) const; + public: explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); @@ -168,14 +176,13 @@ public: ArrayRef<unsigned> Indices = {}) const; using BaseT::getVectorInstrCost; - InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, const Value *Op0, - const Value *Op1) const override; + InstructionCost + getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, + unsigned Index, const Value *Op0, const Value *Op1, + TTI::VectorInstrContext VIC = + TTI::VectorInstrContext::None) const override; bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const; - bool isSourceOfDivergence(const Value *V) const override; - bool isAlwaysUniform(const Value *V) const override; bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override { // Address space casts must cast between different address spaces. @@ -302,6 +309,8 @@ public: /// together under a single i32 value. Otherwise fall back to base /// implementation. 
unsigned getNumberOfParts(Type *Tp) const override; + + InstructionUniformity getInstructionUniformity(const Value *V) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..864d877 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -63,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, const UniformityInfo &UI, ValueMap<const Value *, bool> &Tracker) { llvm::Intrinsic::ID IID = II.getIntrinsicID(); - + /// We deliberately do not simplify readfirstlane with a uniform argument, so + /// that frontends can use it to force a copy to SGPR and thereby prevent the + /// backend from generating unwanted waterfall loops. 
switch (IID) { case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { Value *Src = II.getArgOperand(0); if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) @@ -97,14 +92,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -114,46 +107,95 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, II.eraseFromParent(); return Changed; } + case Intrinsic::amdgcn_wave_shuffle: { + Use &Val = II.getOperandUse(0); + Use &Idx = II.getOperandUse(1); + + // Like with readlane, if Value is uniform then just propagate it + if (!isDivergentUseWithNew(Val, UI, Tracker)) { + II.replaceAllUsesWith(Val); + II.eraseFromParent(); + return true; + } + + // Otherwise, when Index is uniform, this is just a readlane operation + if (isDivergentUseWithNew(Idx, UI, Tracker)) + return false; + + // The readlane intrinsic we want to call has the exact same function + // signature, so we can quickly modify the instruction in-place + Module *Mod = II.getModule(); + II.setCalledFunction(Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::amdgcn_readlane, II.getType())); + return true; + } default: - llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + return false; } return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. 
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { - case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_ballot: - break; - default: + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) continue; - } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {} + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool 
AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d5..fe81a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. 
+ BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? 
nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index 61c5dcd..faef408 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "llvm/ADT/SetVector.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; @@ -44,6 +45,7 @@ namespace { class AMDGPUWaitSGPRHazards { public: + const GCNSubtarget *ST; const SIInstrInfo *TII; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; @@ -54,7 +56,7 @@ public: bool CullSGPRHazardsAtMemWait; unsigned CullSGPRHazardsMemWaitThreshold; - AMDGPUWaitSGPRHazards() {} + AMDGPUWaitSGPRHazards() = default; // Return the numeric ID 0-127 for a given SGPR. 
static std::optional<unsigned> sgprNumber(Register Reg, @@ -165,7 +167,7 @@ public: } unsigned mergeMasks(unsigned Mask1, unsigned Mask2) { - unsigned Mask = 0xffff; + unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST); Mask = AMDGPU::DepCtr::encodeFieldSaSdst( Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1), AMDGPU::DepCtr::decodeFieldSaSdst(Mask2))); @@ -181,9 +183,12 @@ public: Mask = AMDGPU::DepCtr::encodeFieldVaVdst( Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1), AMDGPU::DepCtr::decodeFieldVaVdst(Mask2))); + const AMDGPU::IsaVersion &Version = AMDGPU::getIsaVersion(ST->getCPU()); Mask = AMDGPU::DepCtr::encodeFieldHoldCnt( - Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1), - AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2))); + Mask, + std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1, Version), + AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2, Version)), + Version); Mask = AMDGPU::DepCtr::encodeFieldVaSsrc( Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1), AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2))); @@ -387,7 +392,7 @@ public: // Apply wait if (Wait) { - unsigned Mask = 0xffff; + unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST); if (Wait & WA_VCC) { State.VCCHazard &= ~HazardState::VALU; Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0); @@ -438,8 +443,8 @@ public: } bool run(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasVALUReadSGPRHazard()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasVALUReadSGPRHazard()) return false; // Parse settings @@ -467,10 +472,10 @@ public: if (!EnableSGPRHazardWaits) return false; - TII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); MRI = &MF.getRegInfo(); - DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS; + DsNopCount = ST->isWave64() ? 
WAVE64_NOPS : WAVE32_NOPS; auto CallingConv = MF.getFunction().getCallingConv(); if (!AMDGPU::isEntryFunctionCC(CallingConv) && @@ -555,6 +560,6 @@ PreservedAnalyses AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { if (AMDGPUWaitSGPRHazards().run(MF)) - return PreservedAnalyses::none(); + return getMachineFunctionPassPreservedAnalyses(); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 99ba043..998a9d0 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -126,6 +126,7 @@ public: ImmTySMEMOffsetMod, ImmTyCPol, ImmTyTFE, + ImmTyIsAsync, ImmTyD16, ImmTyClamp, ImmTyOModSI, @@ -143,10 +144,13 @@ public: ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDone, + ImmTyRowEn, ImmTyFORMAT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, + ImmTyWaitEvent, ImmTyInterpSlot, ImmTyInterpAttr, ImmTyInterpAttrChan, @@ -347,6 +351,11 @@ public: return isRegKind() && getReg() == AMDGPU::SGPR_NULL; } + bool isAV_LdSt_32_Align2_RegOp() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::AGPR_32RegClassID); + } + bool isVRegWithInputMods() const; template <bool IsFake16> bool isT16_Lo128VRegWithInputMods() const; template <bool IsFake16> bool isT16VRegWithInputMods() const; @@ -408,6 +417,8 @@ public: bool isNegLo() const { return isImmTy(ImmTyNegLo); } bool isNegHi() const { return isImmTy(ImmTyNegHi); } bool isBitOp3() const { return isImmTy(ImmTyBitOp3) && isUInt<8>(getImm()); } + bool isDone() const { return isImmTy(ImmTyDone); } + bool isRowEn() const { return isImmTy(ImmTyRowEn); } bool isRegOrImm() const { return isReg() || isImm(); @@ -661,6 +672,8 @@ public: bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_v2f16_splat() const { return isVSrc_v2f16(); } + bool isVSrc_NoInline_v2f16() const { 
return isVSrc_v2f16(); } bool isVISrcB32() const { @@ -956,6 +969,7 @@ public: bool isSDelayALU() const; bool isHwreg() const; bool isSendMsg() const; + bool isWaitEvent() const; bool isSplitBarrier() const; bool isSwizzle() const; bool isSMRDOffset8() const; @@ -1108,6 +1122,7 @@ public: case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyIndexKey32bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyIsAsync: OS << "IsAsync"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; case ImmTyClamp: OS << "Clamp"; break; @@ -1133,8 +1148,11 @@ public: case ImmTyExpTgt: OS << "ExpTgt"; break; case ImmTyExpCompr: OS << "ExpCompr"; break; case ImmTyExpVM: OS << "ExpVM"; break; + case ImmTyDone: OS << "Done"; break; + case ImmTyRowEn: OS << "RowEn"; break; case ImmTyHwreg: OS << "Hwreg"; break; case ImmTySendMsg: OS << "SendMsg"; break; + case ImmTyWaitEvent: OS << "WaitEvent"; break; case ImmTyInterpSlot: OS << "InterpSlot"; break; case ImmTyInterpAttr: OS << "InterpAttr"; break; case ImmTyInterpAttrChan: OS << "InterpAttrChan"; break; @@ -1544,6 +1562,12 @@ public: bool isGFX1250() const { return AMDGPU::isGFX1250(getSTI()); } + bool isGFX1250Plus() const { return AMDGPU::isGFX1250Plus(getSTI()); } + + bool isGFX13() const { return AMDGPU::isGFX13(getSTI()); } + + bool isGFX13Plus() const { return AMDGPU::isGFX13Plus(getSTI()); } + bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); } bool isGFX10_BEncoding() const { @@ -1675,7 +1699,8 @@ public: ParseStatus parseNamedBit(StringRef Name, OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool IgnoreNegative = false); unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const; ParseStatus parseCPol(OperandVector &Operands); ParseStatus parseScope(OperandVector &Operands, int64_t &Scope); @@ -1763,7 +1788,7 @@ private: bool 
IsSymbolic = false; bool IsDefined = false; - OperandInfoTy(int64_t Val) : Val(Val) {} + constexpr OperandInfoTy(int64_t Val) : Val(Val) {} }; struct StructuredOpField : OperandInfoTy { @@ -1772,8 +1797,8 @@ private: unsigned Width; bool IsDefined = false; - StructuredOpField(StringLiteral Id, StringLiteral Desc, unsigned Width, - int64_t Default) + constexpr StructuredOpField(StringLiteral Id, StringLiteral Desc, + unsigned Width, int64_t Default) : OperandInfoTy(Default), Id(Id), Desc(Desc), Width(Width) {} virtual ~StructuredOpField() = default; @@ -1860,13 +1885,12 @@ private: bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands, const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); - bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands); bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands); bool validateWMMA(const MCInst &Inst, const OperandVector &Operands); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; - unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + MCRegister findImplicitSGPRReadInVOP(const MCInst &Inst) const; bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS); @@ -1905,6 +1929,7 @@ public: ParseStatus parseExpTgt(OperandVector &Operands); ParseStatus parseSendMsg(OperandVector &Operands); + ParseStatus parseWaitEvent(OperandVector &Operands); ParseStatus parseInterpSlot(OperandVector &Operands); ParseStatus parseInterpAttr(OperandVector &Operands); ParseStatus parseSOPPBrTarget(OperandVector &Operands); @@ -2040,6 +2065,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case 
AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2434,6 +2460,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: @@ -2476,6 +2503,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: @@ -2922,7 +2950,7 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum, return AMDGPU::NoRegister; } - if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) { + if (RegKind == IS_VGPR && !isGFX1250Plus() && RegIdx + RegWidth / 32 > 256) { Error(Loc, "register index is out of range"); return MCRegister(); } @@ -3666,7 +3694,8 @@ StringRef AMDGPUAsmParser::getMatchedVariantName() const { return ""; } -unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { +MCRegister +AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (MCPhysReg Reg : Desc.implicit_uses()) { switch (Reg) { @@ -3680,7 +3709,7 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { break; } } - return AMDGPU::NoRegister; + return MCRegister(); } // NB: This code is correct only when used to check constant @@ -3720,6 +3749,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) return AMDGPU::isInlinableLiteralV2F16(Val); 
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT) + return AMDGPU::isPKFMACF16InlineConstant(Val, isGFX11Plus()); + if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2BF16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2BF16) return AMDGPU::isInlinableLiteralV2BF16(Val); @@ -3855,9 +3887,9 @@ bool AMDGPUAsmParser::validateConstantBusLimitations( LiteralSize = 4; } - SmallDenseSet<unsigned> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); - if (SGPRUsed != AMDGPU::NoRegister) { + SmallDenseSet<MCRegister> SGPRsUsed; + MCRegister SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed) { SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } @@ -3940,7 +3972,7 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) { bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250; - bool AllowSameVGPR = isGFX1250(); + bool AllowSameVGPR = isGFX1250Plus(); if (AsVOPD3) { // Literal constants are not allowed with VOPD3. for (auto OpName : {OpName::src0X, OpName::src0Y}) { @@ -4074,7 +4106,7 @@ bool AMDGPUAsmParser::tryVOPD(const MCInst &Inst) { // form but switch to VOPD3 otherwise. 
bool AMDGPUAsmParser::tryAnotherVOPDEncoding(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); - if (!isGFX1250() || !isVOPD(Opcode)) + if (!isGFX1250Plus() || !isVOPD(Opcode)) return false; if (MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3) @@ -5364,7 +5396,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); - if (!isGFX1250()) { + if (!isGFX1250Plus()) { if (CPol & CPol::SCAL) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); @@ -5506,22 +5538,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst, return true; } -bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst, - const OperandVector &Operands) { - if (Inst.getOpcode() != AMDGPU::S_SET_VGPR_MSB_gfx12) - return true; - - int Simm16Pos = - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16); - if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) { - SMLoc Loc = Operands[1]->getStartLoc(); - Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]"); - return false; - } - - return true; -} - bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, const OperandVector &Operands) { unsigned Opc = Inst.getOpcode(); @@ -5541,12 +5557,9 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, if (RegSize == AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(Fmt) * 32) return true; - static const char *FmtNames[] = {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", - "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", - "MATRIX_FMT_FP4"}; - Error(getOperandLoc(Operands, SrcIdx), - "wrong register tuple size for " + Twine(FmtNames[Fmt])); + "wrong register tuple size for " + + Twine(WMMAMods::ModMatrixFmt[Fmt])); return false; }; @@ -5681,9 +5694,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc, if (!validateTFE(Inst, Operands)) { return false; } - if (!validateSetVgprMSB(Inst, Operands)) { - return false; - } if (!validateWMMA(Inst, Operands)) { return false; } @@ -6182,7 
+6192,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive requires gfx90a+", IDRange); AccumOffset = ExprVal; } else if (ID == ".amdhsa_named_barrier_count") { - if (!isGFX1250()) + if (!isGFX1250Plus()) return Error(IDRange.Start, "directive requires gfx1250+", IDRange); NamedBarCnt = ExprVal; } else if (ID == ".amdhsa_reserve_vcc") { @@ -6382,7 +6392,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return TokError("amdgpu_user_sgpr_count smaller than than implied by " "enabled user SGPRs"); - if (isGFX1250()) { + if (isGFX1250Plus()) { if (!isUInt<COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) return TokError("too many user SGPRs enabled"); AMDGPU::MCKernelDescriptor::bits_set( @@ -6437,7 +6447,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { getContext()); } - if (isGFX1250()) + if (isGFX1250Plus()) MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, NamedBarCnt, COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, @@ -7046,13 +7056,16 @@ ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy) { + AMDGPUOperand::ImmTy ImmTy, + bool IgnoreNegative) { int64_t Bit; SMLoc S = getLoc(); if (trySkipId(Name)) { Bit = 1; } else if (trySkipId("no", Name)) { + if (IgnoreNegative) + return ParseStatus::Success; Bit = 0; } else { return ParseStatus::NoMatch; @@ -7063,6 +7076,12 @@ ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name, if (Name == "a16" && !hasA16()) return Error(S, "a16 modifier is not supported on this GPU"); + if (Bit == 0 && Name == "gds") { + StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); + if (Mnemo.starts_with("ds_gws")) + return Error(S, "nogds is not allowed"); + } + if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) ImmTy = AMDGPUOperand::ImmTyR128A16; @@ -7403,10 +7422,7 @@ ParseStatus 
AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::tryParseMatrixFMT(OperandVector &Operands, StringRef Name, AMDGPUOperand::ImmTy Type) { - return parseStringOrIntWithPrefix(Operands, Name, - {"MATRIX_FMT_FP8", "MATRIX_FMT_BF8", - "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", - "MATRIX_FMT_FP4"}, + return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixFmt, Type); } @@ -7423,8 +7439,8 @@ ParseStatus AMDGPUAsmParser::parseMatrixBFMT(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::tryParseMatrixScale(OperandVector &Operands, StringRef Name, AMDGPUOperand::ImmTy Type) { - return parseStringOrIntWithPrefix( - Operands, Name, {"MATRIX_SCALE_ROW0", "MATRIX_SCALE_ROW1"}, Type); + return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixScale, + Type); } ParseStatus AMDGPUAsmParser::parseMatrixAScale(OperandVector &Operands) { @@ -7440,10 +7456,8 @@ ParseStatus AMDGPUAsmParser::parseMatrixBScale(OperandVector &Operands) { ParseStatus AMDGPUAsmParser::tryParseMatrixScaleFmt(OperandVector &Operands, StringRef Name, AMDGPUOperand::ImmTy Type) { - return parseStringOrIntWithPrefix( - Operands, Name, - {"MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}, - Type); + return parseStringOrIntWithPrefix(Operands, Name, WMMAMods::ModMatrixScaleFmt, + Type); } ParseStatus AMDGPUAsmParser::parseMatrixAScaleFmt(OperandVector &Operands) { @@ -8241,6 +8255,41 @@ bool AMDGPUOperand::isSendMsg() const { return isImmTy(ImmTySendMsg); } +ParseStatus AMDGPUAsmParser::parseWaitEvent(OperandVector &Operands) { + using namespace llvm::AMDGPU::WaitEvent; + + SMLoc Loc = getLoc(); + int64_t ImmVal = 0; + + StructuredOpField DontWaitExportReady("dont_wait_export_ready", "bit value", + 1, 0); + StructuredOpField ExportReady("export_ready", "bit value", 1, 0); + + StructuredOpField *TargetBitfield = + isGFX11() ? 
&DontWaitExportReady : &ExportReady; + + ParseStatus Res = parseStructuredOpFields({TargetBitfield}); + if (Res.isNoMatch() && parseExpr(ImmVal, "structured immediate")) + Res = ParseStatus::Success; + else if (Res.isSuccess()) { + if (!validateStructuredOpFields({TargetBitfield})) + return ParseStatus::Failure; + ImmVal = TargetBitfield->Val; + } + + if (!Res.isSuccess()) + return ParseStatus::Failure; + + if (!isUInt<16>(ImmVal)) + return Error(Loc, "invalid immediate: only 16-bit values are legal"); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, + AMDGPUOperand::ImmTyWaitEvent)); + return ParseStatus::Success; +} + +bool AMDGPUOperand::isWaitEvent() const { return isImmTy(ImmTyWaitEvent); } + //===----------------------------------------------------------------------===// // v_interp //===----------------------------------------------------------------------===// @@ -9048,6 +9097,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); + // Parse a dummy operand as a placeholder for the SWZ operand. This enforces + // agreement between MCInstrDesc.getNumOperands and MCInst.getNumOperands. 
+ Inst.addOperand(MCOperand::createImm(0)); } //===----------------------------------------------------------------------===// @@ -9514,6 +9566,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SCALEF32_PK_FP4_BF16_vi || Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx11 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx11 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods @@ -9523,7 +9577,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, // Adding vdst_in operand is already covered for these DPP instructions in // cvtVOP3DPP. if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && - !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 || + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx11 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx11 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 || @@ -10439,7 +10505,7 @@ ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, case MCK_addr64: return parseTokenOp("addr64", Operands); case 
MCK_done: - return parseTokenOp("done", Operands); + return parseNamedBit("done", Operands, AMDGPUOperand::ImmTyDone, true); case MCK_idxen: return parseTokenOp("idxen", Operands); case MCK_lds: @@ -10449,7 +10515,7 @@ ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, case MCK_off: return parseTokenOp("off", Operands); case MCK_row_95_en: - return parseTokenOp("row_en", Operands); + return parseNamedBit("row_en", Operands, AMDGPUOperand::ImmTyRowEn, true); case MCK_gds: return parseNamedBit("gds", Operands, AMDGPUOperand::ImmTyGDS); case MCK_tfe: @@ -10480,6 +10546,10 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isOffen() ? Match_Success : Match_InvalidOperand; case MCK_tfe: return Operand.isTFE() ? Match_Success : Match_InvalidOperand; + case MCK_done: + return Operand.isDone() ? Match_Success : Match_InvalidOperand; + case MCK_row_95_en: + return Operand.isRowEn() ? Match_Success : Match_InvalidOperand; case MCK_SSrc_b32: // When operands have expression values, they will return true for isToken, // because it is not possible to distinguish between a token and an diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b97b738..568fff2 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -159,9 +159,9 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : } class getMTBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClass> vaddrList=[], bit hasRestrictedSOffset> { + list<RegisterOperand> vaddrList=[], bit hasRestrictedSOffset> { RegisterOperand vdata_op = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList)); dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); @@ -171,7 +171,7 @@ class 
getMTBUFInsDA<list<RegisterOperand> vdataList, dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, - !con((ins vaddrClass:$vaddr), NonVaddrInputs)); + !con((ins vaddr_op:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); @@ -180,10 +180,10 @@ class getMTBUFInsDA<list<RegisterOperand> vdataList, class getMTBUFIns<int addrKind, list<RegisterOperand> vdataList=[], bit hasRestrictedSOffset> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64], hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPROp_32], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPROp_32], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VGPROp_64], hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VGPROp_64], hasRestrictedSOffset>.ret, (ins)))))); } @@ -393,7 +393,7 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : let sccb_value = 0; } -class getBUFVDataRegisterOperand<int Size, bit isTFE> { +class getBUFVDataRegisterOperand<int Size, bit isTFE, bit isTrue16 = false> { defvar tfeVDataOp = !cond(!eq(Size, 16) : AVLdSt_64, !eq(Size, 32) : AVLdSt_64, @@ -402,7 +402,7 @@ class getBUFVDataRegisterOperand<int Size, bit isTFE> { !eq(Size, 128) : AVLdSt_160); defvar VDataOp = - !cond(!eq(Size, 16) : AVLdSt_32, + !cond(!eq(Size, 16) : !if(isTrue16, VGPROp_16, AVLdSt_32), !eq(Size, 32) : AVLdSt_32, !eq(Size, 64) : 
AVLdSt_64, !eq(Size, 96) : AVLdSt_96, @@ -417,15 +417,17 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> { } class getMUBUFInsDA<list<RegisterOperand> vdataList, - list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> { + list<RegisterOperand> vaddrList, bit isTFE, bit hasRestrictedSOffset, + bit isTrue16, bit isLds> { RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); - RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); - RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret; + RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE, isTrue16>.ret; dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); - dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); + dag IsAsyncOpnd = !if(isLds, (ins i1imm_0:$IsAsync), (ins)); + dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz), IsAsyncOpnd); - dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); + dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddr_op:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); } @@ -448,13 +450,14 @@ class getMUBUFElements<ValueType vt> { ); } -class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> { +class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, + bit hasRestrictedSOffset, bit isTrue16, bit isLds> { dag ret = - !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), 
getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret, + !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPROp_32], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPROp_32], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VGPROp_64], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VGPROp_64], isTFE, hasRestrictedSOffset, isTrue16, isLds>.ret, (ins)))))); } @@ -499,7 +502,7 @@ class MUBUF_Load_Pseudo <string opName, RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret> : MUBUF_Pseudo<opName, !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), - !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret, + !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset, 0, isLds>.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret, pattern>, @@ -509,7 +512,7 @@ class MUBUF_Load_Pseudo <string opName, let AsmMatchConverter = "cvtMubuf"; let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); - let LGKM_CNT = isLds; + let LGKM_CNT = 0; let has_vdata = !not(!or(isLds, isLdsOpc)); let mayLoad = 1; let mayStore = isLds; @@ -566,6 +569,33 @@ multiclass MUBUF_Pseudo_Loads_Helper<string opName, ValueType load_vt, } } +multiclass 
MUBUF_Pseudo_Loads_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType load_vt, + bit TiedDest, bit isLds, bit isTFE, bit hasRestrictedSOffset> { + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">; + + def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">; + + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">; + + let DisableWQM = 1 in { + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, 
isTFE, hasRestrictedSOffset, [], VGPROp_16>, + True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">; + } +} + multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, bit TiedDest = 0, bit isLds = 0> { defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>; @@ -577,6 +607,23 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, } } +multiclass MUBUF_Pseudo_Loads_t16<string opName, ValueType load_vt = i32, + bit TiedDest = 0, bit isLds = 0, string hiOpName = NAME#"_HI"> { + let True16Predicate = NotUseRealTrue16Insts in { + defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>; + defm _VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 1>; + } + let True16Predicate = UseRealTrue16Insts in { + defvar NAME16 = opName#"_t16"; + defm _t16 : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName, NAME, i16, 0, isLds, 0, 0>; + defm _t16_VBUFFER : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName#"_VBUFFER", NAME#"_VBUFFER", i16, 0, isLds, 0, 1>; + } + if !not(isLds) then { + defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 0>; + defm _TFE_VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 1>; + } +} + multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> { defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>; @@ -595,10 +642,11 @@ class MUBUF_Store_Pseudo <string opName, ValueType store_vt, bit isTFE = 0, bit hasRestrictedSOffset = 0, - list<dag> pattern=[]> + list<dag> pattern=[], + bit isTrue16 = false> : MUBUF_Pseudo<opName, (outs), - getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret, + getMUBUFIns<addrKind, [getVregSrcForVT<store_vt, isTrue16, 0>.ret], isTFE, hasRestrictedSOffset, isTrue16, 0>.ret, getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret, pattern>, MUBUF_SetupAddr<addrKind> { @@ -650,6 +698,33 @@ multiclass 
MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt, } } +multiclass MUBUF_Pseudo_Stores_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType store_vt, + bit isTFE, bit hasRestrictedSOffset> { + def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + MUBUFAddr64Table<0, NAME>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">; + + def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + MUBUFAddr64Table<1, NAME>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">; + + def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">; + def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">; + def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">; + + let DisableWQM = 1 in { + def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">; + def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">; + def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">; + def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>, + True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">; + } +} + 
multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>; defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>; @@ -658,6 +733,22 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>; } +multiclass MUBUF_Pseudo_Stores_t16<string opName, ValueType store_vt = i32> { + defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>; + defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>; + + let True16Predicate = NotUseRealTrue16Insts in { + defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>; + + defm _VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 1>; + } + let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = HasD16LoadStore in { + defvar NAME16 = opName#"_t16"; + defm _t16 : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI", NAME, i16, 0, 0>; + defm _t16_VBUFFER : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI_VBUFFER", NAME#"_VBUFFER", i16, 0, 1>; + } +} + class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), @@ -677,11 +768,11 @@ class MUBUF_Pseudo_Store_Lds<string opName> } class getMUBUFAtomicInsDA<RegisterOperand vdata_op, bit vdata_in, bit hasRestrictedSOffset, - list<RegisterClassLike> vaddrList=[]> { - RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + list<RegisterOperand> vaddrList=[]> { + RegisterOperand vaddr_op = !if(!empty(vaddrList), ?, !head(vaddrList)); dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); - dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); + dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddr_op:$vaddr))); dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); dag MainInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset)); dag 
CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol), @@ -698,13 +789,13 @@ class getMUBUFAtomicIns<int addrKind, !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_32]>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_64]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VReg_64_AlignTarget]>.ret, + getMUBUFAtomicInsDA<vdataClass, vdata_in, hasRestrictedSOffset, [VGPROp_64]>.ret, (ins)))))); } @@ -783,37 +874,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic> { + ValueType vdataType> { let FPAtomic = vdataType.isFP in { - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_RTN">; - - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_RTN">; - + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, + MUBUFAddr64Table <0, NAME # "_RTN">; + def _ADDR64_RTN : 
MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>, + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>; def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>; - def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; - - def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; - + def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>, + MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; + def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>, + MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>; def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>; def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>; @@ -822,10 +896,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, multiclass MUBUF_Pseudo_Atomics <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic = null_frag> : + ValueType vdataType> : MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>, - MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>; + MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>; 
//===----------------------------------------------------------------------===// @@ -889,10 +962,16 @@ let TiedSourceNotRead = 1 in { >; } // End OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1. + +let TiedSourceNotRead = 1, SubtargetPredicate = HasD16LoadStore, OtherPredicates = [HasFormattedMUBUFInsts] in +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_hi_x", i32 +>; + let OtherPredicates = [HasPackedD16VMem], D16Buf = 1 in { let TiedSourceNotRead = 1 in { - defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_x", f16 + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads_t16 < + "buffer_load_format_d16_x", f16, 0, 0, "BUFFER_LOAD_FORMAT_D16_HI_X" >; defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < "buffer_load_format_d16_xy", v2f16 @@ -948,9 +1027,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_aext_8_globa defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_zext_8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_aext_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_zext_16_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; @@ -977,12 +1053,23 @@ foreach vt = VReg_128.RegTypes in { defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>; } -defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < +let SubtargetPredicate = HasD16LoadStore in { +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < + "buffer_store_byte_d16_hi", i32 +>; + +defm 
BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < + "buffer_store_short_d16_hi", i32 +>; +} + +defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores_t16 < "buffer_store_byte", i32 >; -defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < +defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores_t16 < "buffer_store_short", i32 >; + defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < "buffer_store_dword", i32 >; @@ -1096,7 +1183,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < let OtherPredicates = [HasGFX10_BEncoding] in { defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub + "buffer_atomic_csub", VGPROp_32, i32 >; } @@ -1117,65 +1204,52 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag + "buffer_atomic_fcmpswap", AVLdSt_64, v2f32 >; } let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin", AVLdSt_32, f32, null_frag + "buffer_atomic_fmin", AVLdSt_32, f32 >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax", AVLdSt_32, f32, null_frag + "buffer_atomic_fmax", AVLdSt_32, f32 >; } let SubtargetPredicate = isGFX6GFX7GFX10 in { defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag + "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64 >; } let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16", i32, 1 ->; - defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < "buffer_load_ubyte_d16_hi", i32, 1 >; -defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16", i32, 1 ->; - defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < "buffer_load_sbyte_d16_hi", i32, 1 >; -defm BUFFER_LOAD_SHORT_D16 : 
MUBUF_Pseudo_Loads < - "buffer_load_short_d16", i32, 1 ->; - defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < "buffer_load_short_d16_hi", i32, 1 >; -let OtherPredicates = [HasFormattedMUBUFInsts] in -defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_hi_x", i32 +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads_t16 < + "buffer_load_ubyte_d16", i32, 1 >; -} // End TiedSourceNotRead -defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_byte_d16_hi", i32 +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads_t16 < + "buffer_load_sbyte_d16", i32, 1 >; -defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_short_d16_hi", i32 +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads_t16 < + "buffer_load_short_d16", i32, 1 >; +} // End TiedSourceNotRead let OtherPredicates = [HasFormattedMUBUFInsts] in defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < @@ -1184,6 +1258,18 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < } // End HasD16LoadStore +let True16Predicate = NotUseRealTrue16Insts in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_aext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_zext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SHORT_D16_t16", i16, atomic_load_nonext_16_global>; +} + let SubtargetPredicate = isNotGFX940Plus in def BUFFER_WBINVL1 : MUBUF_Invalidate < "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1 @@ -1201,12 +1287,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - 
"buffer_atomic_add_f32", AVLdSt_32, f32, null_frag + "buffer_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = isGFX12Plus in { @@ -1385,8 +1471,14 @@ let OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts] in { } // End OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts]. let OtherPredicates = [HasPackedD16VMem, HasFormattedMUBUFInsts] in { +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">; +} +let True16Predicate = UseRealTrue16Insts in { + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_t16">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_t16">; +} defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">; @@ -1952,15 +2044,26 @@ multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt, } let OtherPredicates = [Has16BitInsts] in { - +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_constant>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_constant>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_constant>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_global>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_global>; defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, 
zextloadi8_global>; - defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_USHORT", i16, load_global>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SHORT_D16_t16", i16, load_global>; +} + } // End OtherPredicates = [Has16BitInsts] @@ -2000,6 +2103,19 @@ multiclass MUBUFScratchLoadPat_D16_Common <string Instr, >; } +multiclass MUBUFScratchLoadPat_D16_Common_t16 <string Instr, ValueType vt, PatFrag ld_frag> { + def : GCNPat < + (vt (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset))), + (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset) + >; + + def : GCNPat < + (vt (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), + (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset) + >; +} + + multiclass MUBUFScratchLoadPat_D16 <string Instr, ValueType vt, PatFrag ld_frag> { let SubtargetPredicate = HasUnrestrictedSOffset in { @@ -2008,17 +2124,35 @@ multiclass MUBUFScratchLoadPat_D16 <string Instr, defm : MUBUFScratchLoadPat_D16_Common<Instr # "_VBUFFER", vt, ld_frag>; } -let OtherPredicates = [DisableFlatScratch] in { +multiclass MUBUFScratchLoadPat_D16_t16 <string Instr, + ValueType vt, PatFrag ld_frag> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFScratchLoadPat_D16_Common_t16<Instr, vt, ld_frag>; + } + defm : MUBUFScratchLoadPat_D16_Common_t16<Instr # "_VBUFFER", vt, ld_frag>; +} + +let OtherPredicates = 
[NotHasFlatScratchEnabled] in { defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i32, sextloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, extloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, zextloadi8_private>; -defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>; -defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>; -defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SSHORT", i32, sextloadi16_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, extloadi16_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, zextloadi16_private>; + +let True16Predicate = NotUseRealTrue16Insts in { +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i16, load_private>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_private>; +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_private>; +defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SHORT_D16_t16", i16, load_private>; +} foreach vt = Reg32Types.types in { defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORD", vt, load_private>; @@ -2027,7 +2161,7 @@ defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX2", v2i32, load_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX3", v3i32, load_private>; defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX4", v4i32, load_private>; -let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { +let OtherPredicates = [D16PreservesUnusedBits, NotHasFlatScratchEnabled] in { defm : 
MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2i16, load_d16_hi_private>; defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2i16, az_extloadi8_d16_hi_private>; defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2i16, sextloadi8_d16_hi_private>; @@ -2043,7 +2177,7 @@ defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2f16, az_extloadi8_d16_ defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2f16, sextloadi8_d16_lo_private>; } -} // End OtherPredicates = [DisableFlatScratch] +} // End OtherPredicates = [NotHasFlatScratchEnabled] multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> { @@ -2084,8 +2218,15 @@ multiclass MUBUFStore_PatternOffset <string Instr, ValueType vt, defm : MUBUFStore_PatternOffset_Common<Instr # "_VBUFFER", vt, st>; } +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE", i16, truncstorei8_global>; defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_global>; +defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT_t16", i16, store_global>; +} multiclass MUBUFScratchStorePat_Common <string Instr, ValueType vt, PatFrag st, @@ -2112,11 +2253,19 @@ multiclass MUBUFScratchStorePat <string Instr, defm : MUBUFScratchStorePat_Common<Instr # "_VBUFFER", vt, st, rc>; } -let OtherPredicates = [DisableFlatScratch] in { +let OtherPredicates = [NotHasFlatScratchEnabled] in { defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i32, truncstorei8_private>; defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i32, truncstorei16_private>; + +let True16Predicate = NotUseRealTrue16Insts in { defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i16, truncstorei8_private>; defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i16, store_private>; +} + +let True16Predicate = 
UseRealTrue16Insts in { +defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_private, VGPR_16>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_t16", i16, store_private, VGPR_16>; +} foreach vt = Reg32Types.types in { defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORD", vt, store_private>; @@ -2127,7 +2276,7 @@ defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX3", v3i32, store_private, VReg_ defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX4", v4i32, store_private, VReg_128>; -let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, NotHasFlatScratchEnabled] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { @@ -2135,7 +2284,7 @@ defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_D16_HI", i32, store_hi16_privat defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_D16_HI", i32, truncstorei8_hi16_private>; } } -} // End OtherPredicates = [DisableFlatScratch] +} // End OtherPredicates = [NotHasFlatScratchEnabled] //===----------------------------------------------------------------------===// // MTBUF Patterns diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 13f727b68..ae684a5 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) @@ -39,10 +40,6 @@ tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(AMDGPUCommonTableGen) -set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) 
-tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) -add_public_tablegen_target(InstCombineTableGen) - add_llvm_target(AMDGPUCodeGen AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp @@ -52,6 +49,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAsmPrinter.cpp AMDGPUAtomicOptimizer.cpp AMDGPUAttributor.cpp + AMDGPUBarrierLatency.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp @@ -61,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp + AMDGPUHazardLatency.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp @@ -80,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp + AMDGPULowerExecSync.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d0ad120..d8a8450 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -128,7 +128,7 @@ class DS_0A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32> class DS_1A1D_NORET<string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, rc:$data0, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, rc:$data0, Offset:$offset, gds:$gds), " $addr, $data0$offset$gds"> { let has_data1 = 0; @@ -163,7 +163,7 @@ multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterOperand rc = AVLdSt_32> class DS_1A2D_NORET<string opName, RegisterOperand data_op = VGPROp_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, data_op:$data0, data_op:$data1, Offset:$offset, gds:$gds), " $addr, $data0, $data1$offset$gds"> { let has_vdst = 0; @@ -190,7 +190,7 @@ multiclass DS_1A2D_NORET_mc<string opName, RegisterOperand rc = VGPROp_32> { class 
DS_1A2D_Off8_NORET <string opName, RegisterOperand data_op = VGPROp_32> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, data_op:$data0, data_op:$data1, + (ins VGPROp_32:$addr, data_op:$data0, data_op:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $addr, $data0, $data1$offset0$offset1$gds"> { @@ -230,7 +230,7 @@ class DS_0A1D_RET_GDS<string opName, RegisterOperand dst_op = AVLdSt_32, class DS_1A1D_RET <string opName, RegisterOperand data_op = AVLdSt_32> : DS_Pseudo<opName, (outs data_op:$vdst), - (ins VGPR_32:$addr, data_op:$data0, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, data_op:$data0, Offset:$offset, gds:$gds), " $vdst, $addr, $data0$offset$gds"> { let has_data1 = 0; @@ -260,7 +260,7 @@ class DS_1A2D_RET<string opName, RegisterOperand dst_rc = VGPROp_32, RegisterOperand src_rc = dst_rc>: DS_Pseudo<opName, (outs dst_rc:$vdst), - (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, src_rc:$data0, src_rc:$data1, Offset:$offset, gds:$gds), " $vdst, $addr, $data0, $data1$offset$gds"> { let IsAtomicRet = 1; @@ -286,7 +286,7 @@ class DS_1A2D_Off8_RET<string opName, RegisterOperand src_rc = dst_rc> : DS_Pseudo<opName, (outs dst_rc:$vdst), - (ins VGPR_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), + (ins VGPROp_32:$addr, src_rc:$data0, src_rc:$data1, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> { let has_offset = 0; @@ -311,8 +311,8 @@ class DS_BVH_STACK<string opName, RegisterOperand vdst_rc, RegisterOperand data1_rc> : DS_Pseudo<opName, - (outs vdst_rc:$vdst, VGPR_32:$addr), - (ins VGPR_32:$addr_in, VGPR_32:$data0, data1_rc:$data1, Offset:$offset), + (outs vdst_rc:$vdst, VGPROp_32:$addr), + (ins VGPROp_32:$addr_in, VGPROp_32:$data0, data1_rc:$data1, Offset:$offset), " $vdst, $addr, $data0, $data1$offset"> { let Constraints = "$addr = $addr_in"; let has_gds = 0; @@ -327,8 +327,8 @@ class DS_1A_RET<string 
opName, RegisterOperand data_op = AVLdSt_32, : DS_Pseudo<opName, (outs data_op:$vdst), !if(HasTiedOutput, - (ins VGPR_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in), - (ins VGPR_32:$addr, ofs:$offset, gds:$gds)), + (ins VGPROp_32:$addr, ofs:$offset, gds:$gds, data_op:$vdst_in), + (ins VGPROp_32:$addr, ofs:$offset, gds:$gds)), " $vdst, $addr$offset$gds"> { let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); let has_data0 = 0; @@ -366,7 +366,7 @@ class DS_1A_RET_Tied<string opName, RegisterOperand rc = AVLdSt_32> : class DS_1A_Off8_RET <string opName, RegisterOperand rc = AVLdSt_32> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds), + (ins VGPROp_32:$addr, Offset0:$offset0, Offset1:$offset1, gds:$gds), " $vdst, $addr$offset0$offset1$gds"> { let has_offset = 0; @@ -384,7 +384,7 @@ multiclass DS_1A_Off8_RET_mc <string opName, RegisterOperand rc = VGPROp_32> { class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, (outs AVLdSt_32:$vdst), - (ins VGPR_32:$addr, Offset:$offset), + (ins VGPROp_32:$addr, Offset:$offset), " $vdst, $addr$offset gds"> { let has_data0 = 0; @@ -396,7 +396,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, class DS_1A_Off16_NORET <string opName> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, Offset:$offset, gds:$gds), " $addr$offset$gds"> { let has_vdst = 0; @@ -422,7 +422,7 @@ class DS_0A_RET <string opName> : DS_Pseudo<opName, class DS_1A <string opName> : DS_Pseudo<opName, (outs), - (ins VGPR_32:$addr, Offset:$offset, gds:$gds), + (ins VGPROp_32:$addr, Offset:$offset, gds:$gds), " $addr$offset$gds"> { let mayLoad = 1; @@ -463,7 +463,7 @@ class DS_GWS_0D <string opName> class DS_GWS_1D <string opName> : DS_GWS<opName, - (ins AVLdSt_32:$data0, Offset:$offset), + (ins AV_LdSt_32_Align2_RegOp:$data0, Offset:$offset), " $data0$offset gds"> { let has_gws_data0 = 1; @@ -491,7 +491,7 @@ class DS_1A1D_PERMUTE <string 
opName, SDPatternOperator node = null_frag, RegisterOperand data_op = AVLdSt_32> : DS_Pseudo<opName, (outs data_op:$vdst), - (ins VGPR_32:$addr, data_op:$data0, Offset:$offset), + (ins VGPROp_32:$addr, data_op:$data0, Offset:$offset), " $vdst, $addr, $data0$offset", [(set i32:$vdst, (node (DS1Addr1Offset i32:$addr, i32:$offset), i32:$data0))] > { @@ -886,17 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3 def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", int_amdgcn_ds_bpermute_fi_b32>; -multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, - ValueType vt, string frag> { - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_addrspace")>; - - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>; -} - -defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">; } // let SubtargetPredicate = isGFX12Plus let SubtargetPredicate = isGFX1250Plus in { @@ -917,7 +906,7 @@ def DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_1A_Off16_NORET<"ds_atomic_async_barr def : GCNPat < (int_amdgcn_ds_atomic_async_barrier_arrive_b64 (DS1Addr1Offset i32:$ptr, i32:$offset)), - (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0)) + (DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPROp_32:$ptr, Offset:$offset, (i1 0)) >; defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VGPROp_64>; @@ -943,7 +932,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore def : GCNPat < (int_amdgcn_ds_swizzle i32:$src, timm:$offset16), - (DS_SWIZZLE_B32 VGPR_32:$src, (as_i16timm $offset16), (i1 0)) + (DS_SWIZZLE_B32 VGPROp_32:$src, (as_i16timm $offset16), (i1 0)) >; class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < @@ -1279,6 +1268,14 @@ defm : 
DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "ato defm : DSAtomicRetNoRetPat_NoM0_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">; } +let SubtargetPredicate = HasAtomicDsCondSubClampInsts in { + +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">; + +defm : DSAtomicRetNoRetPat_NoM0_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">; + +} // let SubtargetPredicate = HasAtomicDsCondSubClampInsts + let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">; } @@ -1339,28 +1336,28 @@ def : GCNPat < def : GCNPat < (i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), - (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) + (DS_ADD_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)) >; def : GCNPat < (i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + (DS_ADD_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)), VReg_64)), sub0) >; def : GCNPat < (i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), - (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) + (DS_SUB_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)) >; def : GCNPat < (i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + (DS_SUB_GS_REG_RTN VGPROp_32:$src, (as_i32timm $offset32)), VReg_64)), sub0) >; @@ -1488,6 +1485,12 @@ let AssemblerPredicate = isGFX12Plus in { def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>; def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>; +// Additional aliases for ds load transpose instructions. 
+def : MnemonicAlias<"ds_load_b64_tr_b8", "ds_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b128_tr_b16", "ds_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b64_tr_b4", "ds_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"ds_load_b96_tr_b6", "ds_load_tr6_b96">, Requires<[isGFX125xOnly]>; + //===----------------------------------------------------------------------===// // GFX11. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e0375ea..b2dfd09 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -528,12 +528,26 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI, break; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: + Imm = getInlineImmValF16(Imm); + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: Imm = getInlineImmValF16(Imm); break; + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: { + // V_PK_FMAC_F16 on GFX11+ duplicates the f16 inline constant to both + // halves, so we need to produce the duplicated value for correct + // round-trip. 
+ if (isGFX11Plus()) { + int64_t F16Val = getInlineImmValF16(Imm); + Imm = (F16Val << 16) | (F16Val & 0xFFFF); + } else { + Imm = getInlineImmValF16(Imm); + } + break; + } case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: @@ -566,7 +580,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings - if (isGFX1250() && Bytes.size() >= 16) { + if (isGFX1250Plus() && Bytes.size() >= 16) { std::bitset<128> DecW = eat16Bytes(Bytes); if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS)) break; @@ -595,6 +609,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) break; + if (isGFX13() && + tryDecodeInst(DecoderTableGFX1396, DecoderTableGFX13_FAKE1696, MI, + DecW, Address, CS)) + break; + if (STI.hasFeature(AMDGPU::Feature64BitLiterals)) { // Return 8 bytes for a potential literal. Bytes = Bytes_.slice(4, MaxInstBytesNum - 4); @@ -680,6 +699,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS)) break; + if (isGFX13() && + tryDecodeInst(DecoderTableGFX1364, DecoderTableGFX13_FAKE1664, MI, QW, + Address, CS)) + break; + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); } @@ -727,6 +751,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, Address, CS)) break; + + if (isGFX13() && + tryDecodeInst(DecoderTableGFX1332, DecoderTableGFX13_FAKE1632, MI, DW, + Address, CS)) + break; } return MCDisassembler::Fail; @@ -892,6 +921,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // have EXEC as implicit destination. Issue a warning if encoding for // vdst is not EXEC. 
if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) && + MCII->get(MI.getOpcode()).getNumDefs() == 0 && MCII->get(MI.getOpcode()).hasImplicitDefOfPhysReg(AMDGPU::EXEC)) { auto ExecEncoding = MRI.getEncodingValue(AMDGPU::EXEC_LO); if (Bytes_[0] != ExecEncoding) @@ -1198,8 +1228,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // Given a wide tuple \p Reg check if it will overflow 256 registers. // \returns \p Reg on success or NoRegister otherwise. -static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, - const MCRegisterInfo &MRI) { +static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { unsigned NumRegs = RC.getSizeInBits() / 32; MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); if (!Sub0) @@ -1213,7 +1243,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, assert(BaseReg && "Only vector registers expected"); - return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; + return (Sub0 - BaseReg + NumRegs <= 256) ? 
Reg : MCRegister(); } // Note that before gfx10, the MIMG encoding provided no information about @@ -1455,9 +1485,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, return MCOperand(); } -inline -MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); +inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const { + return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI)); } inline @@ -1597,6 +1626,9 @@ AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, case AMDGPU::OPERAND_REG_IMM_V2FP16: UseLit = AMDGPU::isInlinableLiteralV2F16(Val); break; + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + UseLit = AMDGPU::isPKFMACF16InlineConstant(Val, isGFX11Plus()); + break; case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: break; case AMDGPU::OPERAND_REG_IMM_INT16: @@ -2225,6 +2257,16 @@ bool AMDGPUDisassembler::isGFX12Plus() const { bool AMDGPUDisassembler::isGFX1250() const { return AMDGPU::isGFX1250(STI); } +bool AMDGPUDisassembler::isGFX1250Plus() const { + return AMDGPU::isGFX1250Plus(STI); +} + +bool AMDGPUDisassembler::isGFX13() const { return AMDGPU::isGFX13(STI); } + +bool AMDGPUDisassembler::isGFX13Plus() const { + return AMDGPU::isGFX13Plus(STI); +} + bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } @@ -2381,7 +2423,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( } // Bits [27]. - if (isGFX1250()) { + if (isGFX1250Plus()) { PRINT_PSEUDO_DIRECTIVE_COMMENT("FLAT_SCRATCH_IS_NV", COMPUTE_PGM_RSRC1_GFX125_FLAT_SCRATCH_IS_NV); } else { @@ -2395,7 +2437,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( // Bits [29-31]. if (isGFX10Plus()) { // WGP_MODE is not available on GFX1250. 
- if (!isGFX1250()) { + if (!isGFX1250Plus()) { PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); } @@ -2526,7 +2568,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( } // Bits [14-21]. - if (isGFX1250()) { + if (isGFX1250Plus()) { PRINT_DIRECTIVE(".amdhsa_named_barrier_count", COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT); PRINT_PSEUDO_DIRECTIVE_COMMENT( diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index d103d79..28f71d8 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -69,7 +69,7 @@ public: const char* getRegClassName(unsigned RegClassID) const; - MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(MCRegister Reg) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; @@ -182,6 +182,9 @@ public: bool isGFX12() const; bool isGFX12Plus() const; bool isGFX1250() const; + bool isGFX1250Plus() const; + bool isGFX13() const; + bool isGFX13Plus() const; bool hasArchitectedFlatScratch() const; bool hasKernargPreload() const; diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index dadc7dc..a2e3ece 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -505,7 +505,6 @@ def : AMDGPUPat < (fshr i32:$src0, i32:$src1, i32:$src2), (BIT_ALIGN_INT_eg $src0, $src1, $src2) >; -def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; def FMA_eg : FMA_Common<0x7>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 6de59be..63460b5 100644 --- 
a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -229,13 +229,13 @@ class GlobalSaddrTable <bit is_saddr, string Name = ""> { class FLAT_Load_Pseudo< string opName, RegisterOperand vdata_op, bit HasTiedOutput = 0, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let OutOperandList = (outs vdata_op:$vdst); let InOperandList = !con( !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), - (ins VaddrRC:$vaddr, flat_offset:$offset), + (ins VaddrOp:$vaddr, flat_offset:$offset), // FIXME: Operands with default values do not work with following // non-optional operands. !if(HasTiedOutput, (ins CPol:$cpol, vdata_op:$vdst_in), @@ -262,15 +262,25 @@ multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterOperand regClass = AVLdS multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { defm "" : FLAT_Flat_Load_Pseudo<opName, AVLdSt_32, 1>; - let True16Predicate = UseRealTrue16Insts in - defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPROp_16>, True16D16Table<NAME#"_HI", NAME>; + + defvar Name16 = opName#"_t16"; + let True16Predicate = UseRealTrue16Insts in { + def _t16 : FLAT_Load_Pseudo<Name16, VGPROp_16>, + GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_HI", NAME>; + + let OtherPredicates = [HasFlatGVSMode] in + def _t16_SADDR : FLAT_Load_Pseudo<Name16, VGPROp_16, 0, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_HI_SADDR", NAME#"_SADDR">; + } } class FLAT_Store_Pseudo <string opName, RegisterOperand vdataClass, bit HasSaddr = 0, bit EnableSaddr = 0, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> : FLAT_Pseudo<opName, (outs), (ins), ""> { + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_Pseudo<opName, (outs), (ins), ""> { let InOperandList = !con( - (ins VaddrRC:$vaddr, 
vdataClass:$vdata), + (ins VaddrOp:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), (ins flat_offset:$offset, CPol_0:$cpol)); let AsmOperands = " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$cpol"; @@ -380,15 +390,16 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { // Async loads, introduced in gfx1250, will store directly // to a DS address in vdst (they will not use M0 for DS addess). -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(IsAsync, (ins VGPR_32:$vdst), (ins)), - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, (ins VGPROp_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPROp_32:$vaddr), (ins VGPROp_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol), + !if(IsLegacyLDSDMA, (ins i1imm_0:$IsAsync), (ins))), !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = !not(IsAsync); + let LGKM_CNT = 0; let VM_CNT = !not(IsAsync); let ASYNC_CNT = IsAsync; let is_flat_global = 1; @@ -406,10 +417,10 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync, IsLegacyLDSDMA>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync, IsLegacyLDSDMA>, 
GlobalSaddrTable<1, opName>; } @@ -417,7 +428,7 @@ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_P opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), (ins VGPR_32:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPROp_32:$vaddr), (ins VGPROp_64:$vaddr)), (ins VGPROp_32:$vdata), (ins flat_offset:$offset, CPol_0:$cpol)), " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { let VM_CNT = 0; @@ -511,7 +522,7 @@ class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = n let sve = 0; } -class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$vaddr), string asm = " $vaddr"> : +class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VGPROp_64:$vaddr), string asm = " $vaddr"> : FLAT_Pseudo<opName, (outs), !con(addr, (ins flat_offset:$offset, CPol_0:$cpol)), asm#"$offset$cpol"> { let has_vdst = 0; let has_data = 0; @@ -524,7 +535,7 @@ class FLAT_Prefetch_Pseudo<string opName, dag addr = (ins VReg_64_AlignTarget:$v multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { def "" : FLAT_Prefetch_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPROp_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { let OtherPredicates = [HasFlatGVSMode]; let enabled_saddr = 1; @@ -533,9 +544,9 @@ multiclass FLAT_Flat_Prefetch_Pseudo<string opName> { multiclass FLAT_Global_Prefetch_Pseudo<string opName> { let is_flat_global = 1, has_saddr = 1 in { - def "" : FLAT_Prefetch_Pseudo<opName, (ins VReg_64_AlignTarget:$vaddr), " $vaddr, off">, + def "" : FLAT_Prefetch_Pseudo<opName, (ins VGPROp_64:$vaddr), " $vaddr, off">, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPR_32:$vaddr), " $vaddr, $saddr">, + def _SADDR : 
FLAT_Prefetch_Pseudo<opName, (ins SReg_64:$saddr, VGPROp_32:$vaddr), " $vaddr, $saddr">, GlobalSaddrTable<1, opName> { let enabled_saddr = 1; } @@ -557,11 +568,11 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterOperand regClass = AVLdSt (outs regClass:$vdst), !con( !if(EnableSVE, - (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), + (ins VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), !if(EnableSaddr, (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), !if(EnableVaddr, - (ins VGPR_32:$vaddr, flat_offset:$offset), + (ins VGPROp_32:$vaddr, flat_offset:$offset), (ins flat_offset:$offset)))), !if(HasTiedOutput, (ins CPol:$cpol, regClass:$vdst_in), (ins CPol_0:$cpol))), @@ -584,11 +595,11 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterOperand vdata_op, bit En opName, (outs), !if(EnableSVE, - (ins vdata_op:$vdata, VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), !if(EnableSaddr, (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol), !if(EnableVaddr, - (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), + (ins vdata_op:$vdata, VGPROp_32:$vaddr, flat_offset:$offset, CPol_0:$cpol), (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))), " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { let is_flat_scratch = 1; @@ -687,11 +698,11 @@ class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, opName, (outs ), !if(EnableSVE, - (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), + (ins VGPROp_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), !if(EnableSaddr, (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol), !if(EnableVaddr, - (ins VGPR_32:$vaddr, flat_offset:$offset, CPol:$cpol), + (ins VGPROp_32:$vaddr, 
flat_offset:$offset, CPol:$cpol), (ins flat_offset:$offset, CPol:$cpol)))), " "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> { @@ -754,7 +765,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< RegisterOperand data_op = vdst_op> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64_AlignTarget:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), + (ins VGPROp_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName> { let FPAtomic = data_vt.isFP; @@ -763,7 +774,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + (ins VGPROp_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), " $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName> { let OtherPredicates = [HasFlatGVSMode]; @@ -786,7 +797,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VReg_64_AlignTarget:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VGPROp_64:$vaddr, data_op_vgpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; @@ -795,7 +806,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_vgpr:$vdst), - (ins VGPR_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VGPROp_32:$vaddr, data_op_vgpr:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, GlobalSaddrTable<1, opName#"_rtn"> { let OtherPredicates = [HasFlatGVSMode]; @@ -811,7 +822,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< def _RTN_agpr : FLAT_AtomicRet_Pseudo <opName, (outs vdst_op_agpr:$vdst), - (ins 
VReg_64_AlignTarget:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), + (ins VGPROp_64:$vaddr, data_op_agpr:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn_agpr"> { let FPAtomic = data_vt.isFP; @@ -837,10 +848,10 @@ class FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType data_vt = vt, RegisterOperand data_op = vdst_op, bit EnableSaddr = false, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_AtomicNoRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName> { let InOperandList = !con( - (ins VaddrRC:$vaddr, data_op:$vdata), + (ins VaddrOp:$vaddr, data_op:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), (ins flat_offset:$offset, CPol_0:$cpol)); let AsmOperands = " $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"; @@ -867,7 +878,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< RegisterOperand data_op = vdst_op, bit EnableSaddr = false, bit IsVGPR = false, - RegisterClassLike VaddrRC = !if(EnableSaddr, VGPR_32, VReg_64_AlignTarget)> + RegisterOperand VaddrOp = !if(EnableSaddr, VGPROp_32, VGPROp_64)> : FLAT_AtomicRet_Pseudo<opName, (outs), (ins), "">, GlobalSaddrTable<EnableSaddr, opName#"_rtn"#!if(IsVGPR, "", "_agpr")> { defvar vdst_rc= !if(IsVGPR, getEquivalentVGPROperand<vdst_op>.ret, getEquivalentAGPROperand<vdst_op>.ret); @@ -875,7 +886,7 @@ class FLAT_Global_Atomic_Pseudo_RTN< let OutOperandList = (outs vdst_rc:$vdst); let InOperandList = !con( - (ins VaddrRC:$vaddr, data_rc:$vdata), + (ins VaddrOp:$vaddr, data_rc:$vdata), !if(EnableSaddr, (ins SReg_64_XEXEC_XNULL:$saddr), (ins)), (ins flat_offset:$offset, CPol_GLC1:$cpol)); let AsmOperands = " $vdst, $vaddr, $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"; @@ -1202,15 +1213,15 @@ let SubtargetPredicate = HasGFX10_BEncoding in { VGPROp_32, i32>; } -defm 
GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; -defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">; -defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">; -defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; -defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte", 0, 1>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte", 0, 1>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort", 0, 1>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort", 0, 1>; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword", 0, 1>; let SubtargetPredicate = HasGFX950Insts in { -defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">; -defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">; +defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3", 0, 1>; +defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4", 0, 1>; } let SubtargetPredicate = isGFX12PlusNot12_50 in @@ -1224,7 +1235,7 @@ let SubtargetPredicate = isGFX12Plus in { def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">; } // End SubtargetPredicate = isGFX12Plus -let SubtargetPredicate = isGFX1250Plus in { +let SubtargetPredicate = HasMcastLoadInsts in { let Uses = [M0, EXEC, ASYNCcnt], WaveSizePredicate = isWave32 in { defm CLUSTER_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"cluster_load_async_to_lds_b8", 1>; @@ -1243,7 +1254,7 @@ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_s def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : 
FLAT_Global_Tensor_Pseudo<"tensor_stop">; -} // End SubtargetPredicate = isGFX1250Plus +} // End SubtargetPredicate = HasMcastLoadInsts defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte">; defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte">; @@ -1404,62 +1415,62 @@ class FlatSignedLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) >; class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) >; class FlatLoadLDSSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), + 
(node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol), M0), (inst $dsaddr, $vaddr, $offset, $cpol) >; class GlobalLoadLDSSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0), + (node (GlobalSAddrNoIOffsetM0 (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm), M0), (inst $dsaddr, $saddr, $voffset, $offset, $cpol) >; class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), (inst $dsaddr, $vaddr, $offset, $cpol) >; class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm)), (inst $dsaddr, $saddr, $voffset, $offset, $cpol) >; class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), + (node (i64 VReg_64:$vaddr), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)), (inst $vaddr, $dsaddr, $offset, $cpol) >; class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < - (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)), + (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), CPol:$cpol), (i32 VGPROp_32:$dsaddr), (i32 timm:$offset), (i32 
timm)), (inst $saddr, $voffset, $dsaddr, $offset, $cpol) >; class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) >; class GlobalLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16) >; @@ -1469,7 +1480,7 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> >; class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol))), (inst $saddr, $voffset, $offset, $cpol) >; @@ -1479,7 +1490,7 @@ class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType >; class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)), + (vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)), (inst $saddr, $voffset, $offset, $cpol) >; @@ -1489,19 +1500,19 @@ class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp >; class GlobalLoadSaddrPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, 
CPol:$cpol), (i32 timm))), + (vt (node (GlobalSAddrCPol (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm))), (inst $saddr, $voffset, $offset, $cpol) >; class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), + (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol) >; class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat, ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), + (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; @@ -1509,7 +1520,7 @@ class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPatte class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data), + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset, CPol:$cpol), vt:$data), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset, $cpol) >; @@ -1539,7 +1550,7 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let 
OtherPredicates = inst.OtherPredicates; } @@ -1552,10 +1563,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType } } -multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -1568,7 +1575,7 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let OtherPredicates = inst.OtherPredicates; } @@ -1580,10 +1587,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, } } -multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -1612,7 +1615,7 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64_AlignTarget:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { + (inst VGPROp_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> { let SubtargetPredicate = inst.SubtargetPredicate; let 
OtherPredicates = inst.OtherPredicates; } @@ -1631,27 +1634,27 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt, } class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))), (inst $vaddr, $offset) >; class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in), + (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) >; class ScratchLoadSignedPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))), (inst $vaddr, $offset, 0) >; class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (vt (node (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset))), (EXTRACT_SUBREG (inst $vaddr, $offset), lo16) >; class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)), + (node vt:$data, (ScratchOffset (i32 VGPROp_32:$vaddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset) >; @@ -1682,28 +1685,28 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, >; class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchStoreSVaddrPat <FLAT_Pseudo inst, 
SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), + (node vt:$data, (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)), (inst $vaddr, $saddr, $offset, $cpol, $in) >; class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), (inst $vaddr, $saddr, $offset, $cpol) >; class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), + (vt (node (ScratchSVAddr (i32 VGPROp_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))), (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16) >; @@ -2169,14 +2172,16 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } +let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in { + defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >; + + defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>; +} } // end foreach as defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; -defm : 
FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; @@ -2340,10 +2345,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; -defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>; let SubtargetPredicate = HasAtomicCSubNoRtnInsts in -defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; @@ -2360,10 +2365,8 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; let SubtargetPredicate = isGFX12Plus in { - defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; - - let SubtargetPredicate = HasAtomicCSubNoRtnInsts in - defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", 
"global_addrspace", i32>; + defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; + defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; } let OtherPredicates = [isGFX12PlusNot12_50] in @@ -2387,13 +2390,13 @@ let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] } let OtherPredicates = [isGFX125xOnly] in { - def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B32, int_amdgcn_flat_load_monitor_b32, i32>; - def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B64, int_amdgcn_flat_load_monitor_b64, v2i32>; - def : FlatLoadPat_CPOL <FLAT_LOAD_MONITOR_B128, int_amdgcn_flat_load_monitor_b128, v4i32>; + def : FlatLoadPat <FLAT_LOAD_MONITOR_B32, AMDGPUflat_load_monitor, i32>; + def : FlatLoadPat <FLAT_LOAD_MONITOR_B64, AMDGPUflat_load_monitor, v2i32>; + def : FlatLoadPat <FLAT_LOAD_MONITOR_B128, AMDGPUflat_load_monitor, v4i32>; - defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B32, int_amdgcn_global_load_monitor_b32, i32>; - defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B64, int_amdgcn_global_load_monitor_b64, v2i32>; - defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B32, AMDGPUglobal_load_monitor, i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B64, AMDGPUglobal_load_monitor, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_MONITOR_B128, AMDGPUglobal_load_monitor, v4i32>; } // End SubtargetPredicate = isGFX125xOnly let OtherPredicates = [isGFX1250Plus] in { @@ -2450,7 +2453,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; -let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasFlatScratchInsts, HasFlatScratchEnabled] in { defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, 
extloadi8_private, i32>; defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>; @@ -2508,12 +2511,12 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>; -let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, HasFlatScratchEnabled] in { defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>; } -let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, HasFlatScratchEnabled] in { defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>; defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>; defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>; @@ -2529,7 +2532,7 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>; } -} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch] +} // End OtherPredicates = [HasFlatScratchInsts,HasFlatScratchEnabled] def PrefetchLoc: SDNodeXForm<timm, [{ uint32_t V = N->getZExtValue(); @@ -2568,7 +2571,7 @@ multiclass FlatPrefetchPats<string inst, SDPatternOperator prefetch_kind, SDPatt } def : GCNPat < - (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), + (prefetch_kind (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset), rw, (i32 timm:$loc), i32imm_one), (!cast<FLAT_Pseudo>(inst#"_SADDR") 
$saddr, $voffset, $offset, (i32 (PrefetchLoc $loc))) > { let AddedComplexity = !if(!eq(rw, i32imm_zero), 11, 30); @@ -2582,7 +2585,7 @@ multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> { >; def : GCNPat < - (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol), + (intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPROp_32:$voffset), i32:$offset), timm:$cpol), (!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> { let AddedComplexity = 11; } @@ -3642,17 +3645,6 @@ multiclass VFLAT_Real_gfx1250<bits<8> op, } } -multiclass VFLAT_Aliases_gfx1250<string name> { - defvar ps = get_FLAT_ps<NAME>; - if !ne(ps.Mnemonic, name) then - def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX125xOnly]>; -} - -multiclass VFLAT_Real_Base_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : - VFLAT_Aliases_gfx1250<name> { - defm "" : VFLAT_Real_gfx1250<op, name>; -} - multiclass VFLAT_Real_RTN_gfx1250<bits<8> op, string name> { defm _RTN : VFLAT_Real_gfx1250<op, name>; } @@ -3665,9 +3657,14 @@ multiclass VFLAT_Real_SADDR_RTN_gfx1250<bits<8> op, string name> { defm _SADDR_RTN : VFLAT_Real_gfx1250<op, name>; } -multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : - VFLAT_Real_Base_gfx1250<op, name>, - VFLAT_Real_SADDR_gfx1250<op, name>; +multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, + list<Predicate> aliasPreds = [isGFX125xOnly]> : + VFLAT_Real_gfx1250<op, name>, + VFLAT_Real_SADDR_gfx1250<op, name> { + defvar ps = get_FLAT_ps<NAME>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<aliasPreds>; +} multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : VFLAT_Real_AllAddr_gfx1250<op, name>, @@ -3711,6 +3708,12 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>; defm 
GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>; +// Additional aliases for global load transpose instructions. +def : MnemonicAlias<"global_load_b128_tr_b16", "global_load_tr16_b128">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b8", "global_load_tr8_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b64_tr_b4", "global_load_tr4_b64">, Requires<[isGFX125xOnly]>; +def : MnemonicAlias<"global_load_b96_tr_b6", "global_load_tr6_b96">, Requires<[isGFX125xOnly]>; + defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>; defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">; defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">; diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 27f40f1..72805aa 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -134,6 +134,7 @@ public: LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); const SIInstrInfo *SII = ST->getInstrInfo(); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); bool Changed = false; unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(*ST); bool HasVOPD3 = ST->hasVOPD3(); @@ -160,16 +161,25 @@ public: llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3); - if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y && + llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, VOPD3)) { CI = VOPDCombineInfo(FirstMI, SecondMI, VOPD3); - else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + return true; + } + // We can try swapping the order of the instructions, but in that case + // neither instruction can write to a register the other reads from. + // OpX cannot write something OpY reads because that is the hardware + // rule, and OpY cannot write what OpX reads because that would + // violate the data dependency in the original order. 
+ for (const auto &Use : SecondMI->uses()) + if (Use.isReg() && FirstMI->modifiesRegister(Use.getReg(), TRI)) + return false; + if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X && + llvm::checkVOPDRegConstraints(*SII, *SecondMI, *FirstMI, VOPD3)) { CI = VOPDCombineInfo(SecondMI, FirstMI, VOPD3); - else - return false; - // checkVOPDRegConstraints cares about program order, but doReplace - // cares about X-Y order in the constituted VOPD - return llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, - VOPD3); + return true; + } + return false; }; if (checkVOPD(false) || (HasVOPD3 && checkVOPD(true))) { diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 464cbec..6ba669f 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -43,6 +43,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" using namespace llvm; @@ -256,7 +257,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()), *MRI)); auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI); - DPPInst.addReg(CombOldVGPR.Reg, Def ? 
0 : RegState::Undef, + DPPInst.addReg(CombOldVGPR.Reg, getUndefRegState(!Def), CombOldVGPR.SubReg); ++NumOperands; } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a911e7e..d504d86 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -16,6 +16,7 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/TargetParser/TargetParser.h" @@ -190,6 +191,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (checkFPAtomicToDenormModeHazard(MI) > 0) return HazardType; + // Hazards which cannot be mitigated with S_NOPs. + if (!IsHazardRecognizerMode) { + if (checkWMMACoexecutionHazards(MI) > 0) + return Hazard; + } + if (ST.hasNoDataDepHazard()) return NoHazard; @@ -435,10 +442,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; - -using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; -using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; // Search for a hazard in a block and its predecessors. template <typename StateT> @@ -546,11 +550,14 @@ hasHazard(StateT InitialState, // Returns a minimum wait states since \p I walking all predecessors. // Only scans until \p IsExpired does not return true. // Can only be run in a hazard recognizer mode. 
-static int getWaitStatesSince( - GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, - MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, - IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, - GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { +static int +getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + const MachineBasicBlock *MBB, + MachineBasicBlock::const_reverse_instr_iterator I, + int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, + DenseSet<const MachineBasicBlock *> &Visited, + GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates = + SIInstrInfo::getNumWaitStates) { for (auto E = MBB->instr_rend(); I != E; ++I) { // Don't add WaitStates for parent BUNDLE instructions. if (I->isBundle()) @@ -582,20 +589,26 @@ static int getWaitStatesSince( return MinWaitStates; } -static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, - const MachineInstr *MI, IsExpiredFn IsExpired) { +static int +getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, + const MachineInstr *MI, + GCNHazardRecognizer::IsExpiredFn IsExpired, + GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates = + SIInstrInfo::getNumWaitStates) { DenseSet<const MachineBasicBlock *> Visited; return getWaitStatesSince(IsHazard, MI->getParent(), std::next(MI->getReverseIterator()), 0, IsExpired, - Visited, SIInstrInfo::getNumWaitStates); + Visited, GetNumWaitStates); } -int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { +int GCNHazardRecognizer::getWaitStatesSince( + IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) { if (IsHazardRecognizerMode) { auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { return WaitStates >= Limit; }; - return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); + return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn, + GetNumWaitStates); } int WaitStates = 0; @@ -607,7 +620,7 
@@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { if (MI->isInlineAsm()) continue; } - ++WaitStates; + WaitStates += MI ? GetNumWaitStates(*MI) : 1; if (WaitStates >= Limit) break; @@ -615,6 +628,10 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { return std::numeric_limits<int>::max(); } +int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { + return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates); +} + int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit) { @@ -643,7 +660,7 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg) { for (MCRegUnit Unit : TRI.regunits(Reg)) - BV.set(Unit); + BV.set(static_cast<unsigned>(Unit)); } static void addRegsToSet(const SIRegisterInfo &TRI, @@ -1243,6 +1260,20 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); } +// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need +// to insert, negative means not needed. +bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) { + if (WaitStatesNeeded <= 0) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + for (int I = 0; I < WaitStatesNeeded; ++I) + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVMEMtoScalarWriteHazards(MI); fixVcmpxPermlaneHazards(MI); @@ -1257,7 +1288,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixVALUTransUseHazard(MI); fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); // fall-through if co-execution is enabled. 
- fixWMMACoexecutionHazards(MI); + emitVNops(MI, checkWMMACoexecutionHazards(MI)); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); @@ -1306,8 +1337,8 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { bool IsUndef = Src0->isUndef(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32)) - .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0)) - .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); + .addReg(Reg, RegState::Define | getDeadRegState(IsUndef)) + .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill); return true; } @@ -1354,7 +1385,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); return true; } @@ -1487,7 +1518,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST)); return true; } @@ -1502,9 +1533,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, bool HasVmem = false; for (auto &MBB : MF) { for (auto &MI : MBB) { - HasLds |= SIInstrInfo::isDS(MI); - HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || - SIInstrInfo::isSegmentSpecificFLAT(MI); + HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI); + HasVmem |= SIInstrInfo::isVMEM(MI); if (HasLds && HasVmem) return true; } @@ -1526,10 +1556,9 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { assert(!ST.hasExtendedWaitCounts()); auto IsHazardInst = [](const MachineInstr &MI) { - if (SIInstrInfo::isDS(MI)) + if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI)) return 1; 
- if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || - SIInstrInfo::isSegmentSpecificFLAT(MI)) + if (SIInstrInfo::isVMEM(MI)) return 2; return 0; }; @@ -1653,7 +1682,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { } else { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); } return true; @@ -1811,7 +1840,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST)); return true; } @@ -1897,13 +1926,13 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { // avoided. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST)); return true; } bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) { - if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled. + if (!ST.hasGFX1250Insts() || // Coexecution disabled. 
!SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI)) return false; @@ -2047,13 +2076,13 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, return false; } -bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { - if (!AMDGPU::isGFX1250(ST)) - return false; +int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) { + if (!ST.hasGFX1250Insts()) + return 0; const SIInstrInfo *TII = ST.getInstrInfo(); if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI)) - return false; + return 0; const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -2131,9 +2160,6 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { }; int Limit = 0; - auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) { - return WaitStates >= Limit; - }; auto GetWaitStatesFn = [](const MachineInstr &I) { return SIInstrInfo::isVALU(I) ? 1 : 0; @@ -2143,38 +2169,26 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) { if (TII->isXDLWMMA(*MI)) { for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { Limit = WMMAWaitStates[Category]; // for IsExpiredFn. - DenseSet<const MachineBasicBlock *> Visited; - // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // 'getWaitStatesSince' returns the number of VALUs in between if hazard // exists, and INT_MAX if there is no hazard. As a result, a negative // WaitStatesNeeded here means no hazard, and we will continue to search // for other categories. WaitStatesNeeded = - Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, GetWaitStatesFn); + Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn); } } else { // Must be a co-executable VALU. for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) { Limit = VALUWaitStates[Category]; // for IsExpiredFn. 
- DenseSet<const MachineBasicBlock *> Visited; - // '::getWaitStatesSince' returns the number of VALUs in between if hazard + // 'getWaitStatesSince' returns the number of VALUs in between if hazard // exists, and INT_MAX if there is no hazard. As a result, a negative // WaitStatesNeeded here means no hazard, and we will continue to search // for other categories. WaitStatesNeeded = - Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(), - std::next(MI->getReverseIterator()), 0, - IsExpiredFn, Visited, GetWaitStatesFn); + Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn); } } - // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative - // means not needed. - for (int i = 0; i < WaitStatesNeeded; i++) - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_NOP_e32)); - - return true; + return WaitStatesNeeded; } bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { @@ -2204,16 +2218,33 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) return false; - MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); - bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); - bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); - bool Overlapped = OverlappedSrc || OverlappedDst; - - assert(!OverlappedDst || !OverlappedSrc || - Src1->getReg() == MI->getOperand(0).getReg()); assert(ST.needsAlignedVGPRs()); static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); + const DebugLoc &DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); + + // In: + // + // Dst = shiftrev64 Amt, Src1 + // + // if Dst!=Src1 then avoid the bug with: + // + // Dst.sub0 = Amt + // Dst = shift64 Dst.sub0, Src1 + + Register DstReg = MI->getOperand(0).getReg(); + if (!Src1->isReg() || Src1->getReg() != DstReg) { + Register DstLo = 
TRI.getSubReg(DstReg, AMDGPU::sub0); + runOnInstruction( + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstLo).add(*Amt)); + Amt->setReg(DstLo); + Amt->setIsKill(true); + return true; + } + + bool Overlapped = MI->modifiesRegister(AmtReg, &TRI); Register NewReg; for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass : AMDGPU::VGPR_32RegClass) { @@ -2230,8 +2261,6 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (Overlapped) NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0); - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); // Insert a full wait count because found register might be pending a wait. BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT)) .addImm(0); @@ -2269,9 +2298,8 @@ bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { Amt->setIsKill(false); // We do not update liveness, so verifier may see it as undef. Amt->setIsUndef(); - if (OverlappedDst) + if (Overlapped) { MI->getOperand(0).setReg(NewReg); - if (OverlappedSrc) { Src1->setReg(NewReg); Src1->setIsKill(false); Src1->setIsUndef(); @@ -3267,29 +3295,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return false; assert(!ST.hasExtendedWaitCounts()); - if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) + if (!ST.isWave64()) + return false; + + const bool IsSALU = SIInstrInfo::isSALU(*MI); + const bool IsVALU = SIInstrInfo::isVALU(*MI); + if (!IsSALU && !IsVALU) return false; // The hazard sequence is three instructions: // 1. VALU reads SGPR as mask - // 2. SALU writes SGPR - // 3. SALU reads SGPR - // The hazard can expire if the distance between 2 and 3 is sufficient. - // In practice this happens <10% of the time, hence this always assumes - // the hazard exists if 1 and 2 are present to avoid searching. + // 2. VALU/SALU writes SGPR + // 3. VALU/SALU reads SGPR + // The hazard can expire if the distance between 2 and 3 is sufficient, + // or (2) is VALU and (3) is SALU. 
+ // In practice this happens <10% of the time, hence always assume the hazard + // exists if (1) and (2) are present to avoid searching all SGPR reads. - const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IgnoreableSGPR = [](const Register Reg) { + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::M0: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + case AMDGPU::SCC: + return true; + default: + return false; + } + }; + auto IsVCC = [](const Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + }; - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || - HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || - HazardReg == AMDGPU::M0) + struct StateType { + SmallSet<Register, 2> HazardSGPRs; + + static unsigned getHashValue(const StateType &State) { + return hash_combine_range(State.HazardSGPRs); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.HazardSGPRs == RHS.HazardSGPRs; + } + }; + + SmallVector<const MachineInstr *> WaitInstrs; + bool HasSGPRRead = false; + StateType InitialState; + + // Look for SGPR write. + MachineOperand *HazardDef = nullptr; + for (MachineOperand &Op : MI->operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef() && HazardDef) + continue; + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) + continue; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + // Also check for SGPR reads. 
+ if (Op.isUse()) { + HasSGPRRead = true; + continue; + } + + assert(!HazardDef); + HazardDef = &Op; + } + + if (!HazardDef) return false; - auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { + // Setup to track writes to individual SGPRs + const Register HazardReg = HazardDef->getReg(); + if (AMDGPU::SReg_32RegClass.contains(HazardReg)) { + InitialState.HazardSGPRs.insert(HazardReg); + } else { + assert(AMDGPU::SReg_64RegClass.contains(HazardReg)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1)); + } + + auto IsHazardFn = [&](StateType &State, const MachineInstr &I) { + if (State.HazardSGPRs.empty()) + return HazardExpired; + switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: @@ -3304,11 +3406,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { case AMDGPU::V_SUBB_U32_e32: case AMDGPU::V_SUBB_U32_dpp: case AMDGPU::V_SUBBREV_U32_e32: - case AMDGPU::V_SUBBREV_U32_dpp: + case AMDGPU::V_SUBBREV_U32_dpp: { // These implicitly read VCC as mask source. - return HazardReg == AMDGPU::VCC || - HazardReg == AMDGPU::VCC_LO || - HazardReg == AMDGPU::VCC_HI; + return IsVCC(HazardReg) ? HazardFound : NoHazardFound; + } case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: case AMDGPU::V_CNDMASK_B16_t16_e64: @@ -3324,68 +3425,110 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Only check mask register overlaps. const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); assert(SSRCOp); - return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); + bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg); + return Result ? 
HazardFound : NoHazardFound; } default: - return false; + return NoHazardFound; } }; - const MachineRegisterInfo &MRI = MF.getRegInfo(); - auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { - // s_waitcnt_depctr sa_sdst(0) mitigates hazard. - if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - - // VALU access to any SGPR or literal constant other than HazardReg - // mitigates hazard. No need to check HazardReg here as this will - // only be called when !IsHazardFn. - if (!SIInstrInfo::isVALU(I)) - return false; - for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { - const MachineOperand &Op = I.getOperand(OpNo); - if (Op.isReg()) { - Register OpReg = Op.getReg(); - // Only consider uses - if (!Op.isUse()) + const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst( + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST), + 0), + 0); + auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + // Record mergable waits within region of instructions free of SGPR reads. + if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() && + (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits) + WaitInstrs.push_back(&I); + break; + default: + // Update tracking of SGPR reads and writes. 
+ for (auto &Op : I.operands()) { + if (!Op.isReg()) continue; - // Ignore EXEC - if (OpReg == AMDGPU::EXEC || - OpReg == AMDGPU::EXEC_LO || - OpReg == AMDGPU::EXEC_HI) + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) continue; - // Ignore all implicit uses except VCC - if (Op.isImplicit()) { - if (OpReg == AMDGPU::VCC || - OpReg == AMDGPU::VCC_LO || - OpReg == AMDGPU::VCC_HI) - return true; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + if (Op.isUse()) { + HasSGPRRead = true; continue; } - if (TRI.isSGPRReg(MRI, OpReg)) - return true; - } else { - const MCInstrDesc &InstDesc = I.getDesc(); - const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - if (!TII.isInlineConstant(Op, OpInfo)) - return true; + + // Stop tracking any SGPRs with writes on the basis that they will + // already have an appropriate wait inserted afterwards. + SmallVector<Register, 2> Found; + for (Register SGPR : State.HazardSGPRs) { + if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) + Found.push_back(SGPR); + } + for (Register SGPR : Found) + State.HazardSGPRs.erase(SGPR); } + break; } - return false; }; // Check for hazard - if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == - std::numeric_limits<int>::max()) + if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn, + MI->getParent(), + std::next(MI->getReverseIterator()))) return false; - auto NextMI = std::next(MI->getIterator()); + // Compute counter mask + unsigned DepCtr = + IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST) + : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST)) + : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST); + + // Try to merge previous waits into this one for regions with no SGPR reads. + if (!WaitInstrs.empty()) { + // Note: WaitInstrs contains const pointers, so walk backward from MI to + // obtain a mutable pointer to each instruction to be merged. + // This is expected to be a very short walk within the same block. 
+ SmallVector<MachineInstr *> ToErase; + unsigned Found = 0; + for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(), + End = MI->getParent()->rend(); + Found < WaitInstrs.size() && It != End; ++It) { + MachineInstr *WaitMI = &*It; + // Find next wait instruction. + if (std::as_const(WaitMI) != WaitInstrs[Found]) + continue; + Found++; + unsigned WaitMask = WaitMI->getOperand(0).getImm(); + assert((WaitMask & ConstantMaskBits) == ConstantMaskBits); + DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask), + AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr))); + ToErase.push_back(WaitMI); + } + assert(Found == WaitInstrs.size()); + for (MachineInstr *WaitMI : ToErase) + WaitMI->eraseFromParent(); + } - // Add s_waitcnt_depctr sa_sdst(0) after SALU write. + // Add s_waitcnt_depctr after SGPR write. + auto NextMI = std::next(MI->getIterator()); auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(DepCtr); // SALU write may be s_getpc in a bundle. 
updateGetPCBundle(NewMI); @@ -3531,10 +3674,10 @@ bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xFFE3); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xFFE3); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST)); return true; } @@ -3611,7 +3754,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( - AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0)); return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 67beffa..d725134 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -32,6 +32,8 @@ class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { public: typedef function_ref<bool(const MachineInstr &)> IsHazardFn; + typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn; + typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn; private: // Distinguish if we are called from scheduler or hazard recognizer @@ -74,6 +76,8 @@ private: // used on a newly inserted instruction before returning from PreEmitNoops. 
void runOnInstruction(MachineInstr *MI); + int getWaitStatesSince(IsHazardFn IsHazard, int Limit, + GetNumWaitStatesFn GetNumWaitStates); int getWaitStatesSince(IsHazardFn IsHazard, int Limit); int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); @@ -94,6 +98,9 @@ private: int checkReadM0Hazards(MachineInstr *SMovRel); int checkNSAtoVMEMHazard(MachineInstr *MI); int checkFPAtomicToDenormModeHazard(MachineInstr *MI); + // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we + // need to insert, negative means not needed. + bool emitVNops(MachineInstr *MI, int WaitStatesNeeded); void fixHazards(MachineInstr *MI); bool fixVcmpxPermlaneHazards(MachineInstr *MI); bool fixVMEMtoScalarWriteHazards(MachineInstr *MI); @@ -106,7 +113,7 @@ private: bool fixVALUTransUseHazard(MachineInstr *MI); bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); - bool fixWMMACoexecutionHazards(MachineInstr *MI); + int checkWMMACoexecutionHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index f253a84..dff153c 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -381,10 +381,14 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, auto Top = R.Begin; for (const auto &I : Schedule) { auto MI = getMachineInstr(I); - if (MI != &*Top) { + + MachineBasicBlock::iterator MII = MI->getIterator(); + if (MII != Top) { + bool NonDebugReordered = + !MI->isDebugInstr() && skipDebugInstructionsForward(Top, MII) != MII; BB->remove(MI); BB->insert(Top, MI); - if (!MI->isDebugInstr()) + if (NonDebugReordered) LIS->handleMove(*MI, true); } if 
(!MI->isDebugInstr()) { diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce69..5529808 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ public: bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try @@ -81,9 +81,7 @@ class GCNNSAReassignLegacy : public MachineFunctionPass { public: static char ID; - GCNNSAReassignLegacy() : MachineFunctionPass(ID) { - initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry()); - } + GCNNSAReassignLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp index 355bbeb..5e9ac56 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp @@ -57,9 +57,7 @@ public: class GCNPreRALongBranchRegLegacy : public MachineFunctionPass { public: static char ID; - GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) { - initializeGCNPreRALongBranchRegLegacyPass(*PassRegistry::getPassRegistry()); - } + GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { return GCNPreRALongBranchReg().run(MF); diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9..cd56887 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -63,9 +63,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { public: static char ID; - GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) { - 
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry()); - } + GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -136,7 +134,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. Only handle cases where defining @@ -270,15 +268,14 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { continue; Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); - if (Dst.isVirtual() && - MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && - Src.isPhysical() && + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst); + bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC); + if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() && TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); if (Src.isVirtual() && MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && - Dst.isPhysical() && - TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass) + Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass) MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); if (!Dst.isVirtual() || !Src.isVirtual()) continue; @@ -287,8 +284,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); } - if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && - MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) + if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); } } diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td 
b/llvm/lib/Target/AMDGPU/GCNProcessors.td index c6fb31f..9949208 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -304,6 +304,10 @@ def : ProcessorModel<"gfx1153", GFX11SpeedModel, FeatureISAVersion11_5_3.Features >; +def : ProcessorModel<"gfx1170", GFX11SpeedModel, + FeatureISAVersion11_7_0.Features +>; + // [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153] def : ProcessorModel<"gfx11-generic", GFX11SpeedModel, FeatureISAVersion11_Generic.Features @@ -333,3 +337,11 @@ def : ProcessorModel<"gfx1250", GFX1250SpeedModel, def : ProcessorModel<"gfx1251", GFX1250SpeedModel, FeatureISAVersion12_51.Features >; + +//===----------------------------------------------------------------------===// +// GCN GFX13. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1310", GFX12SpeedModel, + FeatureISAVersion13.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 4e11c4f..89307ef 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -97,6 +97,51 @@ void GCNRegPressure::inc(unsigned Reg, Value[RegKind] += Sign; } +namespace { +struct RegExcess { + unsigned SGPR = 0; + unsigned VGPR = 0; + unsigned ArchVGPR = 0; + unsigned AGPR = 0; + + bool anyExcess() const { return SGPR || VGPR || ArchVGPR || AGPR; } + bool hasVectorRegisterExcess() const { return VGPR || ArchVGPR || AGPR; } + + RegExcess(const MachineFunction &MF, const GCNRegPressure &RP) + : RegExcess(MF, RP, GCNRPTarget(MF, RP)) {} + RegExcess(const MachineFunction &MF, const GCNRegPressure &RP, + const GCNRPTarget &Target) { + unsigned MaxSGPRs = Target.getMaxSGPRs(); + unsigned MaxVGPRs = Target.getMaxVGPRs(); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + SGPR = std::max(static_cast<int>(RP.getSGPRNum() - MaxSGPRs), 0); + + // The number of virtual VGPRs 
required to handle excess SGPR + unsigned WaveSize = ST.getWavefrontSize(); + unsigned VGPRForSGPRSpills = divideCeil(SGPR, WaveSize); + + unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); + + // Unified excess pressure conditions, accounting for VGPRs used for SGPR + // spills + VGPR = std::max(static_cast<int>(RP.getVGPRNum(ST.hasGFX90AInsts()) + + VGPRForSGPRSpills - MaxVGPRs), + 0); + + unsigned ArchVGPRLimit = ST.hasGFX90AInsts() ? MaxArchVGPRs : MaxVGPRs; + // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR + // spills + ArchVGPR = std::max(static_cast<int>(RP.getArchVGPRNum() + + VGPRForSGPRSpills - ArchVGPRLimit), + 0); + + // AGPR excess pressure conditions + AGPR = std::max(static_cast<int>(RP.getAGPRNum() - ArchVGPRLimit), 0); + } +}; +} // namespace + bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -123,63 +168,25 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, return Occ > OtherOcc; unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - // SGPR excess pressure conditions - unsigned ExcessSGPR = std::max(static_cast<int>(getSGPRNum() - MaxSGPRs), 0); - unsigned OtherExcessSGPR = - std::max(static_cast<int>(O.getSGPRNum() - MaxSGPRs), 0); - - auto WaveSize = ST.getWavefrontSize(); - // The number of virtual VGPRs required to handle excess SGPR - unsigned VGPRForSGPRSpills = (ExcessSGPR + (WaveSize - 1)) / WaveSize; - unsigned OtherVGPRForSGPRSpills = - (OtherExcessSGPR + (WaveSize - 1)) / WaveSize; + RegExcess Excess(MF, *this); + RegExcess OtherExcess(MF, O); unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); - // Unified excess pressure conditions, accounting for VGPRs used for SGPR - // spills - unsigned ExcessVGPR = - std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) + - VGPRForSGPRSpills - MaxVGPRs), - 0); - unsigned 
OtherExcessVGPR = - std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) + - OtherVGPRForSGPRSpills - MaxVGPRs), - 0); - // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR - // spills - unsigned ExcessArchVGPR = std::max( - static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs), - 0); - unsigned OtherExcessArchVGPR = - std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills - - MaxArchVGPRs), - 0); - // AGPR excess pressure conditions - unsigned ExcessAGPR = std::max( - static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs) - : (getAGPRNum() - MaxVGPRs)), - 0); - unsigned OtherExcessAGPR = std::max( - static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs) - : (O.getAGPRNum() - MaxVGPRs)), - 0); - - bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR; - bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR || - OtherExcessArchVGPR || OtherExcessAGPR; + bool ExcessRP = Excess.anyExcess(); + bool OtherExcessRP = OtherExcess.anyExcess(); // Give second precedence to the reduced number of spills to hold the register // pressure. if (ExcessRP || OtherExcessRP) { // The difference in excess VGPR pressure, after including VGPRs used for // SGPR spills - int VGPRDiff = ((OtherExcessVGPR + OtherExcessArchVGPR + OtherExcessAGPR) - - (ExcessVGPR + ExcessArchVGPR + ExcessAGPR)); + int VGPRDiff = + ((OtherExcess.VGPR + OtherExcess.ArchVGPR + OtherExcess.AGPR) - + (Excess.VGPR + Excess.ArchVGPR + Excess.AGPR)); - int SGPRDiff = OtherExcessSGPR - ExcessSGPR; + int SGPRDiff = OtherExcess.SGPR - Excess.SGPR; if (VGPRDiff != 0) return VGPRDiff > 0; @@ -282,11 +289,12 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, Register Reg = MO.getReg(); auto I = llvm::find_if(VRegMaskOrUnits, [Reg](const VRegMaskOrUnit &RM) { - return RM.RegUnit == Reg; + return RM.VRegOrUnit.asVirtualReg() == Reg; }); auto &P = I == VRegMaskOrUnits.end() - ? 
VRegMaskOrUnits.emplace_back(Reg, LaneBitmask::getNone()) + ? VRegMaskOrUnits.emplace_back(VirtRegOrUnit(Reg), + LaneBitmask::getNone()) : *I; P.LaneMask |= MO.getSubReg() ? TRI.getSubRegIndexLaneMask(MO.getSubReg()) @@ -295,7 +303,7 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, SlotIndex InstrSI; for (auto &P : VRegMaskOrUnits) { - auto &LI = LIS.getInterval(P.RegUnit); + auto &LI = LIS.getInterval(P.VRegOrUnit.asVirtualReg()); if (!LI.hasSubRanges()) continue; @@ -312,29 +320,22 @@ collectVirtualRegUses(SmallVectorImpl<VRegMaskOrUnit> &VRegMaskOrUnits, /// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getLanesWithProperty( const LiveIntervals &LIS, const MachineRegisterInfo &MRI, - bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, - LaneBitmask SafeDefault, + bool TrackLaneMasks, Register Reg, SlotIndex Pos, function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) { - if (RegUnit.isVirtual()) { - const LiveInterval &LI = LIS.getInterval(RegUnit); - LaneBitmask Result; - if (TrackLaneMasks && LI.hasSubRanges()) { - for (const LiveInterval::SubRange &SR : LI.subranges()) { - if (Property(SR, Pos)) - Result |= SR.LaneMask; - } - } else if (Property(LI, Pos)) { - Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) - : LaneBitmask::getAll(); + assert(Reg.isVirtual()); + const LiveInterval &LI = LIS.getInterval(Reg); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; } - - return Result; + } else if (Property(LI, Pos)) { + Result = + TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask::getAll(); } - const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); - if (LR == nullptr) - return SafeDefault; - return Property(*LR, Pos) ? 
LaneBitmask::getAll() : LaneBitmask::getNone(); + return Result; } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp @@ -412,15 +413,15 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg) const { const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); + RegExcess Excess(MF, RP, *this); + if (SRI->isSGPRClass(RC)) - return RP.getSGPRNum() > MaxSGPRs; - unsigned NumVGPRs = - SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - // The addressable limit must always be respected. - if (NumVGPRs > MaxVGPRs) - return true; - // For unified RFs, combined VGPR usage limit must be respected as well. - return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; + return Excess.SGPR; + + if (SRI->isAGPRClass(RC)) + return (UnifiedRF && Excess.VGPR) || Excess.AGPR; + + return (UnifiedRF && Excess.VGPR) || Excess.ArchVGPR; } bool GCNRPTarget::satisfied() const { @@ -431,6 +432,11 @@ bool GCNRPTarget::satisfied() const { return true; } +bool GCNRPTarget::hasVectorRegisterExcess() const { + RegExcess Excess(MF, RP, *this); + return Excess.hasVectorRegisterExcess(); +} + /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -502,10 +508,9 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp -LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, - SlotIndex Pos) const { +LaneBitmask GCNRPTracker::getLastUsedLanes(Register Reg, SlotIndex Pos) const { return getLanesWithProperty( - LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + LIS, *MRI, true, Reg, Pos.getBaseIndex(), [](const LiveRange &LR, SlotIndex Pos) { const LiveRange::Segment *S = LR.getSegmentContaining(Pos); return S != nullptr && S->end == Pos.getRegSlot(); @@ -562,10 +567,10 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { SmallVector<VRegMaskOrUnit, 8> RegUses; 
collectVirtualRegUses(RegUses, MI, LIS, *MRI); for (const VRegMaskOrUnit &U : RegUses) { - LaneBitmask &LiveMask = LiveRegs[U.RegUnit]; + LaneBitmask &LiveMask = LiveRegs[U.VRegOrUnit.asVirtualReg()]; LaneBitmask PrevMask = LiveMask; LiveMask |= U.LaneMask; - CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI); + CurPressure.inc(U.VRegOrUnit.asVirtualReg(), PrevMask, LiveMask, *MRI); } // Update MaxPressure with uses plus early-clobber defs pressure. @@ -580,7 +585,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { bool GCNDownwardRPTracker::reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy) { - MRI = &MI.getParent()->getParent()->getRegInfo(); + MRI = &MI.getMF()->getRegInfo(); LastTrackedMI = nullptr; MBBEnd = MI.getParent()->end(); NextMI = &MI; @@ -748,9 +753,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, GCNRegPressure TempPressure = CurPressure; for (const VRegMaskOrUnit &Use : RegOpers.Uses) { - Register Reg = Use.RegUnit; - if (!Reg.isVirtual()) + if (!Use.VRegOrUnit.isVirtualReg()) continue; + Register Reg = Use.VRegOrUnit.asVirtualReg(); LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); if (LastUseMask.none()) continue; @@ -782,9 +787,9 @@ GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, // Generate liveness for defs. for (const VRegMaskOrUnit &Def : RegOpers.Defs) { - Register Reg = Def.RegUnit; - if (!Reg.isVirtual()) + if (!Def.VRegOrUnit.isVirtualReg()) continue; + Register Reg = Def.VRegOrUnit.asVirtualReg(); auto It = LiveRegs.find(Reg); LaneBitmask LiveMask = It != LiveRegs.end() ? 
It->second : LaneBitmask(0); LaneBitmask NewMask = LiveMask | Def.LaneMask; @@ -824,8 +829,7 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs, Register Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) - OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' - << PrintLaneMask(It->second); + OS << ' ' << printReg(Reg, TRI) << ':' << PrintLaneMask(It->second); } OS << '\n'; }); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 979a8b0..c55796c 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> +#include <array> namespace llvm { @@ -45,7 +46,7 @@ struct GCNRegPressure { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR]; } - void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); } + void clear() { Value.fill(0); } unsigned getNumRegs(RegKind Kind) const { assert(Kind < TOTAL_KINDS); @@ -101,6 +102,29 @@ struct GCNRegPressure { DynamicVGPRBlockSize)); } + unsigned getVGPRSpills(MachineFunction &MF, unsigned ArchVGPRThreshold, + unsigned AGPRThreshold, unsigned CombinedThreshold) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasGFX90AInsts()) + return 0; + + unsigned ArchPressure = getArchVGPRNum(); + unsigned AGPRPressure = getAGPRNum(); + + unsigned ArchSpill = ArchPressure > ArchVGPRThreshold + ? (ArchPressure - ArchVGPRThreshold) + : 0; + unsigned AGPRSpill = + AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0; + + unsigned UnifiedPressure = getVGPRNum(/*UnifiedVGPRFile=*/true); + unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold + ? 
(UnifiedPressure - CombinedThreshold) + : 0; + + return std::max(UnifiedSpill, ArchSpill + AGPRSpill); + } + void inc(unsigned Reg, LaneBitmask PrevMask, LaneBitmask NewMask, @@ -127,9 +151,7 @@ struct GCNRegPressure { bool less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; - bool operator==(const GCNRegPressure &O) const { - return std::equal(&Value[0], &Value[ValueArraySize], O.Value); - } + bool operator==(const GCNRegPressure &O) const { return Value == O.Value; } bool operator!=(const GCNRegPressure &O) const { return !(*this == O); @@ -160,7 +182,7 @@ private: /// Pressure for all register kinds (first all regular registers kinds, then /// all tuple register kinds). - unsigned Value[ValueArraySize]; + std::array<unsigned, ValueArraySize> Value; static unsigned getRegKind(const TargetRegisterClass *RC, const SIRegisterInfo *STI); @@ -235,6 +257,12 @@ public: /// Whether the current RP is at or below the defined pressure target. bool satisfied() const; + bool hasVectorRegisterExcess() const; + + unsigned getMaxSGPRs() const { return MaxSGPRs; } + unsigned getMaxVGPRs() const { + return UnifiedRF ? MaxUnifiedVGPRs : MaxVGPRs; + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) { @@ -293,7 +321,7 @@ protected: /// Mostly copy/paste from CodeGen/RegisterPressure.cpp void bumpDeadDefs(ArrayRef<VRegMaskOrUnit> DeadDefs); - LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + LaneBitmask getLastUsedLanes(Register Reg, SlotIndex Pos) const; public: // reset tracker and set live register set to the specified value. 
@@ -456,7 +484,7 @@ template <typename Range> DenseMap<MachineInstr*, GCNRPTracker::LiveRegSet> getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { std::vector<SlotIndex> Indexes; - Indexes.reserve(std::distance(R.begin(), R.end())); + Indexes.reserve(llvm::size(R)); auto &SII = *LIS.getSlotIndexes(); for (MachineInstr *I : R) { auto SI = SII.getInstructionIndex(*I); @@ -464,7 +492,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { } llvm::sort(Indexes); - auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo(); + auto &MRI = (*R.begin())->getMF()->getRegInfo(); DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { @@ -494,13 +522,13 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, - MI.getParent()->getParent()->getRegInfo()); + MI.getMF()->getRegInfo()); } inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS) { return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, - MI.getParent()->getParent()->getRegInfo()); + MI.getMF()->getRegInfo()); } template <typename Range> diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 9fbf9e5..b044195 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -28,10 +28,19 @@ #include "GCNRegPressure.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include 
"llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "machine-scheduler" @@ -90,6 +99,10 @@ static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( cl::init(false)); #endif +static cl::opt<bool> DisableRewriteMFMAFormSchedStage( + "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, + cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(true)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -145,7 +158,6 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit); SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit); VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit); - LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit << ", VGPRExcessLimit = " << VGPRExcessLimit << ", SGPRCriticalLimit = " << SGPRCriticalLimit @@ -690,6 +702,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); + if (!DisableRewriteMFMAFormSchedStage) + SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm); SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); SchedStages.push_back(GCNSchedStageID::PreRARematerialize); @@ -946,6 +960,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { switch (SchedStageID) { case GCNSchedStageID::OccInitialSchedule: return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this); + case 
GCNSchedStageID::RewriteMFMAForm: + return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this); case GCNSchedStageID::UnclusteredHighRPReschedule: return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this); case GCNSchedStageID::ClusteredLowOccupancyReschedule: @@ -970,6 +986,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { + if (Regions[RegionIdx].first == Regions[RegionIdx].second) + return llvm::getRegPressure(MRI, LiveIns[RegionIdx]); GCNDownwardRPTracker RPTracker(*LIS); RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, &LiveIns[RegionIdx]); @@ -978,10 +996,8 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd) { - auto REnd = RegionEnd == RegionBegin->getParent()->end() - ? std::prev(RegionEnd) - : RegionEnd; - return &*skipDebugInstructionsBackward(REnd, RegionBegin); + assert(RegionBegin != RegionEnd && "Region must not be empty"); + return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin); } void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, @@ -1076,9 +1092,12 @@ GCNScheduleDAGMILive::getRegionLiveOutMap() const { assert(!Regions.empty()); std::vector<MachineInstr *> RegionLastMIs; RegionLastMIs.reserve(Regions.size()); - for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) { + // Skip empty regions. + if (RegionBegin == RegionEnd) + continue; RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); - + } return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); } @@ -1088,10 +1107,12 @@ void RegionPressureMap::buildLiveRegMap() { RegionLiveRegMap = IsLiveOut ? 
DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); for (unsigned I = 0; I < DAG->Regions.size(); I++) { + auto &[RegionBegin, RegionEnd] = DAG->Regions[I]; + // Skip empty regions. + if (RegionBegin == RegionEnd) + continue; MachineInstr *RegionKey = - IsLiveOut - ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) - : &*DAG->Regions[I].first; + IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin; IdxToInstruction[I] = RegionKey; } } @@ -1160,6 +1181,8 @@ void GCNScheduleDAGMILive::runSchedStages() { ScheduleDAGMILive::schedule(); Stage->finalizeGCNRegion(); + Stage->advanceRegion(); + exitRegion(); } Stage->finalizeGCNSchedStage(); @@ -1180,6 +1203,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { case GCNSchedStageID::OccInitialSchedule: OS << "Max Occupancy Initial Schedule"; break; + case GCNSchedStageID::RewriteMFMAForm: + OS << "Instruction Rewriting Reschedule"; + break; case GCNSchedStageID::UnclusteredHighRPReschedule: OS << "Unclustered High Register Pressure Reschedule"; break; @@ -1213,6 +1239,107 @@ bool GCNSchedStage::initGCNSchedStage() { return true; } +void RewriteMFMAFormStage::findReachingDefs( + MachineOperand &UseMO, LiveIntervals *LIS, + SmallVectorImpl<SlotIndex> &DefIdxs) { + MachineInstr *UseMI = UseMO.getParent(); + LiveInterval &UseLI = LIS->getInterval(UseMO.getReg()); + VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI)); + + // If the def is not a PHI, then it must be the only reaching def. 
+ if (!VNI->isPHIDef()) { + DefIdxs.push_back(VNI->def); + return; + } + + SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()}; + SmallVector<MachineBasicBlock *, 8> Worklist; + + // Mark the predecessor blocks for traversal + for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) { + Worklist.push_back(PredMBB); + Visited.insert(PredMBB); + } + + while (!Worklist.empty()) { + MachineBasicBlock *CurrMBB = Worklist.pop_back_val(); + + SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB); + VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot()); + + MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def); + + // If there is a def in this block, then add it to the list. This is the + // reaching def of this path. + if (!VNI->isPHIDef()) { + DefIdxs.push_back(VNI->def); + continue; + } + + for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) { + if (Visited.insert(PredMBB).second) + Worklist.push_back(PredMBB); + } + } +} + +void RewriteMFMAFormStage::findReachingUses( + MachineInstr *DefMI, LiveIntervals *LIS, + SmallVectorImpl<MachineOperand *> &ReachingUses) { + SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI); + for (MachineOperand &UseMO : + DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) { + SmallVector<SlotIndex, 8> ReachingDefIndexes; + findReachingDefs(UseMO, LIS, ReachingDefIndexes); + + // If we find a use that contains this DefMI in its reachingDefs, then it is + // a reaching use. + if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) { + return SlotIndex::isSameInstr(RDIdx, DefIdx); + })) + ReachingUses.push_back(&UseMO); + } +} + +bool RewriteMFMAFormStage::initGCNSchedStage() { + // We only need to run this pass if the architecture supports AGPRs. + // Additionally, we don't use AGPRs at occupancy levels above 1 so there + // is no need for this pass in that case, either. 
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1) + return false; + + RegionsWithExcessArchVGPR.resize(DAG.Regions.size()); + RegionsWithExcessArchVGPR.reset(); + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { + GCNRegPressure PressureBefore = DAG.Pressure[Region]; + if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs()) + RegionsWithExcessArchVGPR[Region] = true; + } + + if (RegionsWithExcessArchVGPR.none()) + return false; + + TII = ST.getInstrInfo(); + SRI = ST.getRegisterInfo(); + + std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands; + DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse; + SmallPtrSet<MachineInstr *, 8> CopyForDef; + + if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef)) + return false; + + int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef); + + // If we haven't found the beneficial conditions, prefer the VGPR form which + // may result in less cross RC copies. + if (Cost > 0) + return false; + + return rewrite(RewriteCands); +} + bool UnclusteredHighRPStage::initGCNSchedStage() { if (DisableUnclusterHighRP) return false; @@ -1228,18 +1355,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; - // Aggressivly try to reduce register pressure in the unclustered high RP + // Aggressively try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. + TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy + ? InitialOccupancy + 1 + : InitialOccupancy; + IsAnyRegionScheduled = false; S.SGPRLimitBias = S.HighRPSGPRBias; S.VGPRLimitBias = S.HighRPVGPRBias; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) - MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); LLVM_DEBUG( dbgs() << "Retrying function scheduling without clustering. 
" - "Aggressivly try to reduce register pressure to achieve occupancy " - << DAG.MinOccupancy << ".\n"); + "Aggressively try to reduce register pressure to achieve occupancy " + << TempTargetOccupancy << ".\n"); return true; } @@ -1267,33 +1396,222 @@ bool ClusteredLowOccStage::initGCNSchedStage() { #define REMAT_PREFIX "[PreRARemat] " #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +Printable PreRARematStage::ScoredRemat::print() const { + return Printable([&](raw_ostream &OS) { + OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')'; + }); +} +#endif + bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for // regions inbetween the defs and region we sinked the def to. Will need to be // fixed if there is another pass after this pass. assert(!S.hasNextStage()); - if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1) + if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1) return false; + // Maps all MIs (except lone terminators, which are not part of any region) to + // their parent region. Non-lone terminators are considered part of the region + // they delimitate. + DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount()); + // Before performing any IR modification record the parent region of each MI // and the parent MBB of each region. 
const unsigned NumRegions = DAG.Regions.size(); - RegionBB.reserve(NumRegions); for (unsigned I = 0; I < NumRegions; ++I) { RegionBoundaries Region = DAG.Regions[I]; for (auto MI = Region.first; MI != Region.second; ++MI) MIRegion.insert({&*MI, I}); - RegionBB.push_back(Region.first->getParent()); + MachineBasicBlock *ParentMBB = Region.first->getParent(); + if (Region.second != ParentMBB->end()) + MIRegion.insert({&*Region.second, I}); + RegionBB.push_back(ParentMBB); } - if (!canIncreaseOccupancyOrReduceSpill()) +#ifndef NDEBUG + auto PrintTargetRegions = [&]() -> void { + if (TargetRegions.none()) { + dbgs() << REMAT_PREFIX << "No target regions\n"; + return; + } + dbgs() << REMAT_PREFIX << "Target regions:\n"; + for (unsigned I : TargetRegions.set_bits()) + dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n'; + }; + auto PrintRematReg = [&](const RematReg &Remat) -> Printable { + return Printable([&, Remat](raw_ostream &OS) { + // Concatenate all region numbers in which the register is unused and + // live-through. + bool HasLiveThroughRegion = false; + OS << '[' << Remat.DefRegion << " -"; + for (unsigned I = 0; I < NumRegions; ++I) { + if (Remat.isUnusedLiveThrough(I)) { + if (HasLiveThroughRegion) { + OS << ','; + } else { + OS << "- "; + HasLiveThroughRegion = true; + } + OS << I; + } + } + if (HasLiveThroughRegion) + OS << " -"; + OS << "-> " << Remat.UseRegion << "] "; + Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false, + /*SkipDebugLoc=*/false, /*AddNewLine=*/false); + }); + }; +#endif + + // Set an objective for the stage based on current RP in each region. 
+ REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + }); + if (!setObjective()) { + LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU() << '\n'); return false; + } + LLVM_DEBUG({ + if (TargetOcc) { + dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n'; + } else { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ")\n"; + } + PrintTargetRegions(); + }); + + if (!collectRematRegs(MIRegion)) { + REMAT_DEBUG(dbgs() << "No rematerializable registers\n"); + return false; + } + const ScoredRemat::FreqInfo FreqInfo(MF, DAG); + REMAT_DEBUG({ + dbgs() << "Rematerializable registers:\n"; + for (const RematReg &Remat : RematRegs) + dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n'; + dbgs() << REMAT_PREFIX << "Region frequencies\n"; + for (auto [I, Freq] : enumerate(FreqInfo.Regions)) { + dbgs() << REMAT_PREFIX << " [" << I << "] "; + if (Freq) + dbgs() << Freq; + else + dbgs() << "unknown "; + dbgs() << " | " << *DAG.Regions[I].first; + } + }); + + SmallVector<ScoredRemat> ScoredRemats; + for (RematReg &Remat : RematRegs) + ScoredRemats.emplace_back(&Remat, FreqInfo, DAG); + +// Rematerialize registers in successive rounds until all RP targets are +// satisifed or until we run out of rematerialization candidates. +#ifndef NDEBUG + unsigned RoundNum = 0; +#endif + BitVector RecomputeRP(NumRegions); + do { + assert(!ScoredRemats.empty() && "no more remat candidates"); + + // (Re-)Score and (re-)sort all remats in increasing score order. 
+ for (ScoredRemat &Remat : ScoredRemats) + Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc); + sort(ScoredRemats); + + REMAT_DEBUG({ + dbgs() << "==== ROUND " << RoundNum++ << " ====\n" + << REMAT_PREFIX + << "Candidates with non-null score, in rematerialization order:\n"; + for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) { + if (RematDecision.hasNullScore()) + break; + dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | " + << *RematDecision.Remat->DefMI; + } + PrintTargetRegions(); + }); + + RecomputeRP.reset(); + unsigned RematIdx = ScoredRemats.size(); + + // Rematerialize registers in decreasing score order until we estimate + // that all RP targets are satisfied or until rematerialization candidates + // are no longer useful to decrease RP. + for (; RematIdx && TargetRegions.any(); --RematIdx) { + const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1]; + // Stop rematerializing on encountering a null score. Since scores + // monotonically decrease as we rematerialize, we know there is nothing + // useful left to do in such cases, even if we were to re-score. + if (Candidate.hasNullScore()) { + RematIdx = 0; + break; + } + + RematReg &Remat = *Candidate.Remat; + // When previous rematerializations in this round have already satisfied + // RP targets in all regions this rematerialization can impact, we have a + // good indication that our scores have diverged significantly from + // reality, in which case we interrupt this round and re-score. This also + // ensures that every rematerialization we perform is possibly impactful + // in at least one target region. + if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) + break; + + REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';); + // Every rematerialization we do here is likely to move the instruction + // into a higher frequency region, increasing the total sum latency of the + // instruction itself. 
This is acceptable if we are eliminating a spill in + // the process, but when the goal is increasing occupancy we get nothing + // out of rematerialization if occupancy is not increased in the end; in + // such cases we want to roll back the rematerialization. + RollbackInfo *Rollback = + TargetOcc ? &Rollbacks.emplace_back(&Remat) : nullptr; + rematerialize(Remat, RecomputeRP, Rollback); + unsetSatisifedRPTargets(Remat.Live); + } + + REMAT_DEBUG({ + if (!TargetRegions.any()) { + dbgs() << "** Interrupt round on all targets achieved\n"; + } else if (RematIdx) { + dbgs() << "** Interrupt round on stale score for " + << *ScoredRemats[RematIdx - 1].Remat->DefMI; + } else { + dbgs() << "** Stop on exhausted rematerialization candidates\n"; + } + }); + + // Peel off registers we already rematerialized from the vector's tail. + ScoredRemats.truncate(RematIdx); + } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) && + !ScoredRemats.empty()); + if (RescheduleRegions.none()) + return false; + + // Commit all pressure changes to the DAG and compute minimum achieved + // occupancy in impacted regions. + REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n"); + unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); + for (unsigned I : RescheduleRegions.set_bits()) { + DAG.Pressure[I] = RPTargets[I].getCurrentRP(); + REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " + << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize) + << " (" << RPTargets[I] << ")\n"); + } + AchievedOcc = MFI.getMaxWavesPerEU(); + for (const GCNRegPressure &RP : DAG.Pressure) { + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); + } - // Rematerialize identified instructions and update scheduler's state. - rematerialize(); - if (GCNTrackers) - DAG.RegionLiveOuts.buildLiveRegMap(); REMAT_DEBUG({ dbgs() << "Retrying function scheduling with new min. 
occupancy of " << AchievedOcc << " from rematerializing (original was " @@ -1303,11 +1621,7 @@ bool PreRARematStage::initGCNSchedStage() { dbgs() << ")\n"; }); - if (AchievedOcc > DAG.MinOccupancy) { - DAG.MinOccupancy = AchievedOcc; - SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - MFI.increaseOccupancy(MF, DAG.MinOccupancy); - } + DAG.setTargetOccupancy(getStageTargetOccupancy()); return true; } @@ -1320,15 +1634,26 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { + assert(IsAnyRegionScheduled); LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); + } else if (!IsAnyRegionScheduled) { + assert(DAG.MinOccupancy == InitialOccupancy); + LLVM_DEBUG(dbgs() << StageID + << ": No regions scheduled, min occupancy stays at " + << DAG.MinOccupancy << ", MFI occupancy stays at " + << MFI.getOccupancy() << ".\n"); } GCNSchedStage::finalizeGCNSchedStage(); } bool GCNSchedStage::initGCNRegion() { + // Skip empty scheduling region. + if (DAG.begin() == DAG.end()) + return false; + // Check whether this new region is also a new block. if (DAG.RegionBegin->getParent() != CurrentMBB) setupNewBlock(); @@ -1336,8 +1661,8 @@ bool GCNSchedStage::initGCNRegion() { unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); - // Skip empty scheduling regions (0 or 1 schedulable instructions). - if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) + // Skip regions with 1 schedulable instruction. + if (DAG.begin() == std::prev(DAG.end())) return false; LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); @@ -1396,13 +1721,25 @@ bool UnclusteredHighRPStage::initGCNRegion() { // rescheduling of previous regions did not make occupancy drop back down to // the initial minimum). 
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + // If no region has been scheduled yet, the DAG has not yet been updated with + // the occupancy target. So retrieve it from the temporary. + unsigned CurrentTargetOccupancy = + IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; if (!DAG.RegionsWithExcessRP[RegionIdx] && - (DAG.MinOccupancy <= InitialOccupancy || + (CurrentTargetOccupancy <= InitialOccupancy || DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != InitialOccupancy)) return false; - return GCNSchedStage::initGCNRegion(); + bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); + // If this is the first region scheduled during this stage, make the target + // occupancy changes in the DAG and MFI. + if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { + IsAnyRegionScheduled = true; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) + DAG.setTargetOccupancy(TempTargetOccupancy); + } + return IsSchedulingThisRegion; } bool ClusteredLowOccStage::initGCNRegion() { @@ -1447,9 +1784,23 @@ void GCNSchedStage::finalizeGCNRegion() { if (DAG.RegionsWithIGLPInstrs[RegionIdx] && StageID != GCNSchedStageID::UnclusteredHighRPReschedule) SavedMutations.swap(DAG.Mutations); +} - DAG.exitRegion(); - advanceRegion(); +void PreRARematStage::finalizeGCNRegion() { + GCNSchedStage::finalizeGCNRegion(); + // When the goal is to increase occupancy, all regions must reach the target + // occupancy for rematerializations to be possibly useful, otherwise we will + // just hurt latency for no benefit. If minimum occupancy drops below the + // target there is no point in trying to re-schedule further regions. 
+ if (!TargetOcc) + return; + RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore); + if (DAG.MinOccupancy < *TargetOcc) { + REMAT_DEBUG(dbgs() << "Region " << RegionIdx + << " cannot meet occupancy target, interrupting " + "re-scheduling in all regions\n"); + RescheduleRegions.reset(); + } } void GCNSchedStage::checkScheduling() { @@ -1518,10 +1869,12 @@ void GCNSchedStage::checkScheduling() { // Revert if this region's schedule would cause a drop in occupancy or // spilling. - if (shouldRevertScheduling(WavesAfter)) - revertScheduling(); - else + if (shouldRevertScheduling(WavesAfter)) { + modifyRegionSchedule(RegionIdx, DAG.BB, Unsched); + std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx]; + } else { DAG.Pressure[RegionIdx] = PressureAfter; + } } unsigned @@ -1723,8 +2076,9 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { } bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { - return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); + // When trying to increase occupancy (TargetOcc == true) the stage manages + // region reverts globally (all or none), so we always return false here. 
+ return !TargetOcc && mayCauseSpilling(WavesAfter); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1749,89 +2103,625 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { return false; } -void GCNSchedStage::revertScheduling() { - LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); - DAG.RegionEnd = DAG.RegionBegin; - int SkippedDebugInstr = 0; - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) { - ++SkippedDebugInstr; - continue; - } +void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx, + MachineBasicBlock *MBB, + ArrayRef<MachineInstr *> MIOrder) { + assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first, + DAG.Regions[RegionIdx].second)) == + MIOrder.size() && + "instruction number mismatch"); + if (MIOrder.empty()) + return; - if (MI->getIterator() != DAG.RegionEnd) { - DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI); - if (!MI->isDebugInstr()) + LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n'); + + // Reconstruct MI sequence by moving instructions in desired order before + // the current region's start. + MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first; + for (MachineInstr *MI : MIOrder) { + // Either move the next MI in order before the end of the region or move the + // region end past the MI if it is at the correct position. + MachineBasicBlock::iterator MII = MI->getIterator(); + if (MII != RegionEnd) { + // Will subsequent splice move MI up past a non-debug instruction? + bool NonDebugReordered = + !MI->isDebugInstr() && + skipDebugInstructionsForward(RegionEnd, MII) != MII; + MBB->splice(RegionEnd, MBB, MI); + // Only update LiveIntervals information if non-debug instructions are + // reordered. Otherwise debug instructions could cause code generation to + // change. 
+ if (NonDebugReordered) DAG.LIS->handleMove(*MI, true); + } else { + ++RegionEnd; + } + if (MI->isDebugInstr()) { + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); + continue; } // Reset read-undef flags and update them later. - for (auto &Op : MI->all_defs()) + for (MachineOperand &Op : MI->all_defs()) Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); - if (!MI->isDebugInstr()) { - if (DAG.ShouldTrackLaneMasks) { - // Adjust liveness and add missing dead+read-undef flags. - SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); - RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); - } else { - // Adjust for missing dead-def flags. - RegOpers.detectDeadDefs(*MI, *DAG.LIS); - } + if (DAG.ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *DAG.LIS); } - DAG.RegionEnd = MI->getIterator(); - ++DAG.RegionEnd; LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } - // After reverting schedule, debug instrs will now be at the end of the block - // and RegionEnd will point to the first debug instr. Increment RegionEnd - // pass debug instrs to the actual end of the scheduling region. - while (SkippedDebugInstr-- > 0) - ++DAG.RegionEnd; + // The region end doesn't change throughout scheduling since it itself is + // outside the region (whether that is a MBB end or a terminator MI). 
+ assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch"); + DAG.Regions[RegionIdx].first = MIOrder.front(); +} + +bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const { - // If Unsched.front() instruction is a debug instruction, this will actually - // shrink the region since we moved all debug instructions to the end of the - // block. Find the first instruction that is not a debug instruction. - DAG.RegionBegin = Unsched.front()->getIterator(); - if (DAG.RegionBegin->isDebugInstr()) { - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) + if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI)) + return false; + return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1; +} + +bool RewriteMFMAFormStage::initHeuristics( + std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + SmallPtrSetImpl<MachineInstr *> &CopyForDef) { + bool Changed = false; + + // Prepare for the heuristics + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isRewriteCandidate(&MI)) continue; - DAG.RegionBegin = MI->getIterator(); - break; + + int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()); + assert(ReplacementOp != -1); + + RewriteCands.push_back({&MI, MI.getOpcode()}); + MI.setDesc(TII->get(ReplacementOp)); + + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2->isReg()) { + SmallVector<SlotIndex, 8> Src2ReachingDefs; + findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs); + + // For any definition of the src2 register which is non-MFMA, we + // insert a copy. 
+ for (SlotIndex RDIdx : Src2ReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx); + if (!TII->isMAI(*RD)) + CopyForDef.insert(RD); + } + } + + MachineOperand &Dst = MI.getOperand(0); + SmallVector<MachineOperand *, 8> DstReachingUses; + + findReachingUses(&MI, DAG.LIS, DstReachingUses); + + for (MachineOperand *RUOp : DstReachingUses) { + if (TII->isMAI(*RUOp->getParent())) + continue; + + // For any user of the result of the MFMA which is not an MFMA, we + // insert a copy. For a given register, we will only insert one copy + // per user block. + CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg()); + + SmallVector<SlotIndex, 8> DstUsesReachingDefs; + findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs); + + for (SlotIndex RDIndex : DstUsesReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); + if (TII->isMAI(*RD)) + continue; + + // For any definition of the user of the MFMA which is not an MFMA, + // we insert a copy. We do this to transform all the reaching defs + // of this use to AGPR. By doing this, we can insert a copy from + // AGPR to VGPR at the user rather than after the MFMA. + CopyForDef.insert(RD); + } + } + + // Do the rewrite to allow for updated RP calculation. + const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC); + DAG.MRI.setRegClass(Dst.getReg(), AGPRRC); + if (Src2->isReg()) + DAG.MRI.setRegClass(Src2->getReg(), AGPRRC); + Changed = true; } } - // Then move the debug instructions back into their correct place and set - // RegionBegin and RegionEnd if needed. 
- DAG.placeDebugValues(); + return Changed; +} - DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); +int64_t RewriteMFMAFormStage::getRewriteCost( + const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + const SmallPtrSetImpl<MachineInstr *> &CopyForDef) { + MachineBlockFrequencyInfo *MBFI = DAG.MBFI; + + int64_t BestSpillCost = 0; + int64_t Cost = 0; + uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency(); + + std::pair<unsigned, unsigned> MaxVectorRegs = + ST.getMaxNumVectorRegs(MF.getFunction()); + unsigned ArchVGPRThreshold = MaxVectorRegs.first; + unsigned AGPRThreshold = MaxVectorRegs.second; + unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF); + + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { + if (!RegionsWithExcessArchVGPR[Region]) + continue; + + GCNRegPressure &PressureBefore = DAG.Pressure[Region]; + unsigned SpillCostBefore = PressureBefore.getVGPRSpills( + MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); + + // For the cases we care about (i.e. ArchVGPR usage is greater than the + // addressable limit), rewriting alone should bring pressure to manageable + // level. If we find any such region, then the rewrite is potentially + // beneficial. + GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region); + unsigned SpillCostAfter = PressureAfter.getVGPRSpills( + MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); + + uint64_t BlockFreq = + MBFI->getBlockFreq(DAG.Regions[Region].first->getParent()) + .getFrequency(); + + bool RelativeFreqIsDenom = EntryFreq > BlockFreq; + uint64_t RelativeFreq = EntryFreq && BlockFreq + ? (RelativeFreqIsDenom ? 
EntryFreq / BlockFreq + : BlockFreq / EntryFreq) + : 1; + + // This assumes perfect spilling / splitting -- using one spill / copy + // instruction and one restoreFrom / copy for each excess register, + int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2; + + // Also account for the block frequency. + if (RelativeFreqIsDenom) + SpillCost /= (int64_t)RelativeFreq; + else + SpillCost *= (int64_t)RelativeFreq; + + // If we have increased spilling in any block, just bail. + if (SpillCost > 0) + return SpillCost; + + if (SpillCost < BestSpillCost) + BestSpillCost = SpillCost; + } + + // Set the cost to the largest decrease in spill cost in order to not double + // count spill reductions. + Cost = BestSpillCost; + assert(Cost <= 0); + + unsigned CopyCost = 0; + + // For each CopyForDef, increase the cost by the register size while + // accounting for block frequency. + for (MachineInstr *DefMI : CopyForDef) { + Register DefReg = DefMI->getOperand(0).getReg(); + uint64_t DefFreq = + EntryFreq + ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq + : 1; + + const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg); + CopyCost += RC->getCopyCost() * DefFreq; + } + + // Account for CopyForUse copies in each block that the register is used. + for (auto &[UseBlock, UseRegs] : CopyForUse) { + uint64_t UseFreq = + EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1; + + for (Register UseReg : UseRegs) { + const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg); + CopyCost += RC->getCopyCost() * UseFreq; + } + } + + // Reset the classes that were changed to AGPR for better RB analysis. + // We must do rewriting after copy-insertion, as some defs of the register + // may require VGPR. Additionally, if we bail out and don't perform the + // rewrite then these need to be restored anyway. 
+ for (auto &[MI, OriginalOpcode] : RewriteCands) { + assert(TII->isMAI(*MI)); + const TargetRegisterClass *AGPRRC = + DAG.MRI.getRegClass(MI->getOperand(0).getReg()); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC); + + MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + assert(Src2); + + if (Src2->isReg()) + DAG.MRI.setRegClass(Src2->getReg(), VGPRRC); + DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC); + MI->setDesc(TII->get(OriginalOpcode)); + } + + return Cost + CopyCost; } -bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { +bool RewriteMFMAFormStage::rewrite( + const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) { + DenseMap<MachineInstr *, unsigned> FirstMIToRegion; + DenseMap<MachineInstr *, unsigned> LastMIToRegion; + + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { + RegionBoundaries Entry = DAG.Regions[Region]; + if (Entry.first == Entry.second) + continue; + + FirstMIToRegion[&*Entry.first] = Region; + if (Entry.second != Entry.first->getParent()->end()) + LastMIToRegion[&*Entry.second] = Region; + } + + // Rewrite the MFMAs to AGPR, and insert any copies as needed. + // The general assumption of the algorithm (and the previous cost calculation) + // is that it is better to insert the copies in the MBB of the def of the src2 + // operands, and in the MBB of the user of the dest operands. This is based on + // the assumption that the MFMAs are likely to appear in loop bodies, while + // the src2 and dest operands are live-in / live-out of the loop. Due to this + // design, the algorithm for finding copy insertion points is more + // complicated. + // + // There are three main cases to handle: 1. the reaching defs of the src2 + // operands, 2. the reaching uses of the dst operands, and 3. the reaching + // defs of the reaching uses of the dst operand. + // + // In the first case, we simply insert copies after each of the reaching + // definitions. 
In the second case, we collect all the uses of a given dest + // and organize them by MBB. Then, we insert 1 copy for each MBB before the + // earliest use. Since the use may have multiple reaching defs, and since we + // want to replace the register it is using with the result of the copy, we + // must handle case 3. In the third case, we simply insert a copy after each + // of the reaching defs to connect to the copy of the reaching uses of the dst + // reg. This allows us to avoid inserting copies next to the MFMAs. + // + // While inserting the copies, we maintain a map of operands which will use + // different regs (i.e. the result of the copies). For example, a case 1 src2 + // operand will use the register result of the copies after the reaching defs, + // as opposed to the original register. Now that we have completed our copy + // analysis and placement, we can bulk update the registers. We do this + // separately as to avoid complicating the reachingDef and reachingUse + // queries. + // + // While inserting the copies, we also maintain a list or registers which we + // will want to reclassify as AGPR. After doing the copy insertion and the + // register replacement, we can finally do the reclassification. This uses the + // redef map, as the registers we are interested in reclassifying may be + // replaced by the result of a copy. We must do this after the copy analysis + // and placement as we must have an accurate redef map -- otherwise we may end + // up creating illegal instructions. + + // The original registers of the MFMA that need to be reclassified as AGPR. + DenseSet<Register> RewriteRegs; + // The map of an original register in the MFMA to a new register (result of a + // copy) that it should be replaced with. + DenseMap<Register, Register> RedefMap; + // The map of the original MFMA registers to the relevant MFMA operands. 
+ DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap; + // The map of reaching defs for a given register -- to avoid duplicate copies. + DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap; + // The map of reaching uses for a given register by basic block -- to avoid + // duplicate copies and to calculate per MBB insert pts. + DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>> + ReachingUseTracker; + + for (auto &[MI, OriginalOpcode] : RewriteCands) { + int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()); + if (ReplacementOp == -1) + continue; + MI->setDesc(TII->get(ReplacementOp)); + + // Case 1: insert copies for the reaching defs of the Src2Reg. + MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + if (Src2->isReg()) { + Register Src2Reg = Src2->getReg(); + if (!Src2Reg.isVirtual()) + return false; + + Register MappedReg = Src2->getReg(); + SmallVector<SlotIndex, 8> Src2ReachingDefs; + findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs); + SmallSetVector<MachineInstr *, 8> Src2DefsReplace; + + for (SlotIndex RDIndex : Src2ReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); + if (TII->isMAI(*RD)) + continue; + + // If there is a non mai reaching def, then we need a copy. + Src2DefsReplace.insert(RD); + } + + if (!Src2DefsReplace.empty()) { + DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg); + if (RI != RedefMap.end()) { + MappedReg = RI->second; + } else { + assert(!ReachingDefCopyMap.contains(Src2Reg)); + const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg); + const TargetRegisterClass *VGPRRC = + SRI->getEquivalentVGPRClass(Src2RC); + + // Track the mapping of the original register to the new register. + MappedReg = DAG.MRI.createVirtualRegister(VGPRRC); + RedefMap[Src2Reg] = MappedReg; + } + + // If none exists, create a copy from this reaching def. + // We may have inserted a copy already in an earlier iteration. 
+ for (MachineInstr *RD : Src2DefsReplace) { + // Do not create redundant copies. + if (ReachingDefCopyMap[Src2Reg].insert(RD).second) { + MachineInstrBuilder VGPRCopy = + BuildMI(*RD->getParent(), std::next(RD->getIterator()), + RD->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(MappedReg, {}, 0) + .addUse(Src2Reg, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + + // If this reaching def was the last MI in the region, update the + // region boundaries. + if (LastMIToRegion.contains(RD)) { + unsigned UpdateRegion = LastMIToRegion[RD]; + DAG.Regions[UpdateRegion].second = VGPRCopy; + LastMIToRegion.erase(RD); + } + } + } + } + + // Track the register for reclassification + RewriteRegs.insert(Src2Reg); + + // Always insert the operand for replacement. If this corresponds with a + // chain of tied-def we may not see the VGPR requirement until later. + ReplaceMap[Src2Reg].insert(Src2); + } + + // Case 2 and Case 3: insert copies before the reaching uses of the dsts, + // and after the reaching defs of the reaching uses of the dsts. + + MachineOperand *Dst = &MI->getOperand(0); + Register DstReg = Dst->getReg(); + if (!DstReg.isVirtual()) + return false; + + Register MappedReg = DstReg; + SmallVector<MachineOperand *, 8> DstReachingUses; + + SmallVector<MachineOperand *, 8> DstReachingUseCopies; + SmallVector<MachineInstr *, 8> DstUseDefsReplace; + + findReachingUses(MI, DAG.LIS, DstReachingUses); + + for (MachineOperand *RUOp : DstReachingUses) { + if (TII->isMAI(*RUOp->getParent())) + continue; + + // If there is a non mai reaching use, then we need a copy. 
+ if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end()) + DstReachingUseCopies.push_back(RUOp); + SmallVector<SlotIndex, 8> DstUsesReachingDefs; + findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs); + + for (SlotIndex RDIndex : DstUsesReachingDefs) { + MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); + if (TII->isMAI(*RD)) + continue; + + // If there is a non mai reaching def of this reaching use, then we will + // need a copy. + if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end()) + DstUseDefsReplace.push_back(RD); + } + } + + if (!DstUseDefsReplace.empty()) { + DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg); + if (RI != RedefMap.end()) { + MappedReg = RI->second; + } else { + assert(!ReachingDefCopyMap.contains(DstReg)); + const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); + + // Track the mapping of the original register to the new register. + MappedReg = DAG.MRI.createVirtualRegister(VGPRRC); + RedefMap[DstReg] = MappedReg; + } + + // If none exists, create a copy from this reaching def. + // We may have inserted a copy already in an earlier iteration. + for (MachineInstr *RD : DstUseDefsReplace) { + // Do not create redundant copies. + if (ReachingDefCopyMap[DstReg].insert(RD).second) { + MachineInstrBuilder VGPRCopy = + BuildMI(*RD->getParent(), std::next(RD->getIterator()), + RD->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(MappedReg, {}, 0) + .addUse(DstReg, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + + // If this reaching def was the last MI in the region, update the + // region boundaries. 
+ DenseMap<MachineInstr *, unsigned>::iterator LMI = + LastMIToRegion.find(RD); + if (LMI != LastMIToRegion.end()) { + unsigned UpdateRegion = LMI->second; + DAG.Regions[UpdateRegion].second = VGPRCopy; + LastMIToRegion.erase(RD); + } + } + } + + DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg]; + for (MachineOperand *RU : DstReachingUseCopies) { + MachineBasicBlock *RUBlock = RU->getParent()->getParent(); + // Just keep track of the reaching use of this register by block. After we + // have scanned all the MFMAs we can find optimal insert pts. + if (RUBlock != MI->getParent()) { + ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU); + continue; + } + + // Special case, the use is in the same block as the MFMA. Insert the copy + // just before the use. + const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); + Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC); + MachineInstr *UseInst = RU->getParent(); + MachineInstrBuilder VGPRCopy = + BuildMI(*UseInst->getParent(), UseInst->getIterator(), + UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(NewUseReg, {}, 0) + .addUse(DstReg, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + // Since we know this use has only one reaching def, we can replace the + // use reg. + RU->setReg(NewUseReg); + // Track the copy source operand for replacement. + DstRegSet.insert(&VGPRCopy->getOperand(1)); + } + + // Track the register for reclassification + RewriteRegs.insert(DstReg); + + // Insert the dst operand for replacement. If this dst is in a chain of + // tied-def MFMAs, and the first src2 needs to be replaced with a new reg, + // all the corresponding operands need to be replaced. + DstRegSet.insert(Dst); + } + + // Handle the copies for dst uses. 
+ using RUBType = + std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>; + for (RUBType RUBlockEntry : ReachingUseTracker) { + using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>; + for (RUDType RUDst : RUBlockEntry.second) { + MachineOperand *OpBegin = *RUDst.second.begin(); + SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent()); + + // Find the earliest use in this block. + for (MachineOperand *User : RUDst.second) { + SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent()); + if (SlotIndex::isEarlierInstr(NewInstPt, InstPt)) + InstPt = NewInstPt; + } + + const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first); + const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); + Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC); + MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt); + + MachineInstrBuilder VGPRCopy = + BuildMI(*UseInst->getParent(), UseInst->getIterator(), + UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addDef(NewUseReg, {}, 0) + .addUse(RUDst.first, {}, 0); + DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); + + // If this UseInst was the first MI in the region, update the region + // boundaries. + DenseMap<MachineInstr *, unsigned>::iterator FI = + FirstMIToRegion.find(UseInst); + if (FI != FirstMIToRegion.end()) { + unsigned UpdateRegion = FI->second; + DAG.Regions[UpdateRegion].first = VGPRCopy; + FirstMIToRegion.erase(UseInst); + } + + // Replace the operand for all users. + for (MachineOperand *User : RUDst.second) { + User->setReg(NewUseReg); + } + + // Track the copy source operand for replacement. + ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1)); + } + } + + // We may have needed to insert copies after the reaching defs of the MFMAs. + // Replace the original register with the result of the copy for all relevant + // operands. 
+ for (std::pair<Register, Register> NewDef : RedefMap) { + Register OldReg = NewDef.first; + Register NewReg = NewDef.second; + + // Replace the register for any associated operand in the MFMA chain. + for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) + ReplaceOp->setReg(NewReg); + } + + // Finally, do the reclassification of the MFMA registers. + for (Register RewriteReg : RewriteRegs) { + Register RegToRewrite = RewriteReg; + + // Be sure to update the replacement register and not the original. + DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg); + if (RI != RedefMap.end()) + RegToRewrite = RI->second; + + const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite); + const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC); + + DAG.MRI.setRegClass(RegToRewrite, AGPRRC); + } + + // Bulk update the LIS. + DAG.LIS->reanalyze(DAG.MF); + // Liveins may have been modified for cross RC copies + RegionPressureMap LiveInUpdater(&DAG, false); + LiveInUpdater.buildLiveRegMap(); + + for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) + DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region); + + DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx); + + return true; +} + +unsigned PreRARematStage::getStageTargetOccupancy() const { + return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU(); +} + +bool PreRARematStage::setObjective() { const Function &F = MF.getFunction(); - // Maps optimizable regions (i.e., regions at minimum and register-limited - // occupancy, or regions with spilling) to the target RP we would like to - // reach. - DenseMap<unsigned, GCNRPTarget> OptRegions; + // Set up "spilling targets" for all regions. 
unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); - auto ResetTargetRegions = [&]() { - OptRegions.clear(); - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - const GCNRegPressure &RP = DAG.Pressure[I]; - GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - }; + bool HasVectorRegisterExcess = false; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + TargetRegions.set(I); + HasVectorRegisterExcess |= Target.hasVectorRegisterExcess(); + } - ResetTargetRegions(); - if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { // In addition to register usage being above addressable limits, occupancy // below the minimum is considered like "spilling" as well. TargetOcc = std::nullopt; @@ -1839,94 +2729,68 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { // There is no spilling and room to improve occupancy; set up "increased // occupancy targets" for all regions. 
TargetOcc = DAG.MinOccupancy + 1; - unsigned VGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize(); MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); - ResetTargetRegions(); - } - REMAT_DEBUG({ - dbgs() << "Analyzing "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << ": "; - if (OptRegions.empty()) { - dbgs() << "no objective to achieve, occupancy is maximal at " - << MFI.getMaxWavesPerEU(); - } else if (!TargetOcc) { - dbgs() << "reduce spilling (minimum target occupancy is " - << MFI.getMinWavesPerEU() << ')'; - } else { - dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " - << TargetOcc; - } - dbgs() << '\n'; - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { - dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() - << '\n'; - } + for (auto [I, Target] : enumerate(RPTargets)) { + Target.setTarget(MaxSGPRs, MaxVGPRs); + if (!Target.satisfied()) + TargetRegions.set(I); } - }); - if (OptRegions.empty()) - return false; + } - // Accounts for a reduction in RP in an optimizable region. Returns whether we - // estimate that we have identified enough rematerialization opportunities to - // achieve our goal, and sets Progress to true when this particular reduction - // in pressure was helpful toward that goal. 
- auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
- bool &Progress) -> bool {
- GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg))
- return false;
- Progress = true;
- Target.saveReg(Reg, Mask, DAG.MRI);
- if (Target.satisfied())
- OptRegions.erase(OptIt->getFirst());
- return OptRegions.empty();
- };
+ return TargetRegions.any();
+}
+bool PreRARematStage::collectRematRegs(
+ const DenseMap<MachineInstr *, unsigned> &MIRegion) {
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
- // Cache set of registers that are going to be rematerialized.
- DenseSet<unsigned> RematRegs;
+ // Set of registers already marked for potential rematerialization; used to
+ // avoid rematerialization chains.
+ SmallSet<Register, 4> MarkedRegs;
+ auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
+ return MO.isReg() && MarkedRegs.contains(MO.getReg());
+ };
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto Region = DAG.Regions[I];
- for (auto MI = Region.first; MI != Region.second; ++MI) {
+ RegionBoundaries Bounds = DAG.Regions[I];
+ for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
// The instruction must be rematerializable.
MachineInstr &DefMI = *MI;
if (!isReMaterializable(DefMI))
continue;
- // We only support rematerializing virtual registers with one definition.
+ // We only support rematerializing virtual registers with one
+ // definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
- // non-debug user in a different region. The using MI may not belong to a
- // region if it is a lone region terminator.
+ // non-debug user in a different region.
+ // FIXME: Allow rematerializations with multiple uses. This should be + // relatively easy to support using the current cost model. MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg); if (!UseMI) continue; auto UseRegion = MIRegion.find(UseMI); - if (UseRegion != MIRegion.end() && UseRegion->second == I) + if (UseRegion == MIRegion.end() || UseRegion->second == I) continue; // Do not rematerialize an instruction if it uses or is used by an // instruction that we have designated for rematerialization. // FIXME: Allow for rematerialization chains: this requires 1. updating - // remat points to account for uses that are rematerialized, and 2. either - // rematerializing the candidates in careful ordering, or deferring the - // MBB RP walk until the entire chain has been rematerialized. - if (Rematerializations.contains(UseMI) || - llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) { - return MO.isReg() && RematRegs.contains(MO.getReg()); - })) + // remat points to account for uses that are rematerialized, and 2. + // either rematerializing the candidates in careful ordering, or + // deferring the MBB RP walk until the entire chain has been + // rematerialized. 
+ const MachineOperand &UseMO = UseMI->getOperand(0); + if (IsMarkedForRemat(UseMO) || + llvm::any_of(DefMI.operands(), IsMarkedForRemat)) continue; // Do not rematerialize an instruction it it uses registers that aren't @@ -1937,188 +2801,257 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { *DAG.TII)) continue; - REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); - RematInstruction &Remat = - Rematerializations.try_emplace(&DefMI, UseMI).first->second; - - bool RematUseful = false; - if (auto It = OptRegions.find(I); It != OptRegions.end()) { - // Optimistically consider that moving the instruction out of its - // defining region will reduce RP in the latter; this assumes that - // maximum RP in the region is reached somewhere between the defining - // instruction and the end of the region. - REMAT_DEBUG(dbgs() << " Defining region is optimizable\n"); - LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg]; - if (ReduceRPInRegion(It, Reg, Mask, RematUseful)) - return true; - } - - for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) { - // We are only collecting regions in which the register is a live-in - // (and may be live-through). - auto It = DAG.LiveIns[LIRegion].find(Reg); - if (It == DAG.LiveIns[LIRegion].end() || It->second.none()) - continue; - Remat.LiveInRegions.insert(LIRegion); - - // Account for the reduction in RP due to the rematerialization in an - // optimizable region in which the defined register is a live-in. This - // is exact for live-through region but optimistic in the using region, - // where RP is actually reduced only if maximum RP is reached somewhere - // between the beginning of the region and the rematerializable - // instruction's use. 
- if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) { - REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n'); - if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg], - RematUseful)) - return true; - } - } - - // If the instruction is not a live-in or live-out in any optimizable - // region then there is no point in rematerializing it. - if (!RematUseful) { - Rematerializations.pop_back(); - REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n"); - } else { - RematRegs.insert(Reg); - } + // Add the instruction to the rematerializable list. + MarkedRegs.insert(Reg); + RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion); } } - if (TargetOcc) { - // We were trying to increase occupancy but failed, abort the stage. - REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); - Rematerializations.clear(); - return false; + return !RematRegs.empty(); +} + +PreRARematStage::RematReg::RematReg( + MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG, + const DenseMap<MachineInstr *, unsigned> &MIRegion) + : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()), + LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()), + DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) { + + // Mark regions in which the rematerializable register is live. 
+ Register Reg = getReg(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + auto LiveInIt = DAG.LiveIns[I].find(Reg); + if (LiveInIt != DAG.LiveIns[I].end()) + LiveIn.set(I); + const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I); + if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end()) + LiveOut.set(I); + } + Live |= LiveIn; + Live |= LiveOut; + Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg); +} + +bool PreRARematStage::RematReg::maybeBeneficial( + const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const { + Register Reg = getReg(); + for (unsigned I : TargetRegions.set_bits()) { + if (Live[I] && RPTargets[I].isSaveBeneficial(Reg)) + return true; } - REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n"); - return !Rematerializations.empty(); + return false; } -void PreRARematStage::rematerialize() { - const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); +void PreRARematStage::RematReg::insertMI(unsigned RegionIdx, + MachineInstr *RematMI, + GCNScheduleDAGMILive &DAG) const { + RegionBoundaries &Bounds = DAG.Regions[RegionIdx]; + if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI))) + Bounds.first = RematMI; + DAG.LIS->InsertMachineInstrInMaps(*RematMI); + DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg()); +} + +PreRARematStage::ScoredRemat::FreqInfo::FreqInfo( + MachineFunction &MF, const GCNScheduleDAGMILive &DAG) { + assert(DAG.MLI && "MLI not defined in DAG"); + MachineBranchProbabilityInfo MBPI; + MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI); - // Collect regions whose RP changes in unpredictable way; we will have to - // fully recompute their RP after all rematerailizations. - DenseSet<unsigned> RecomputeRP; - - // Rematerialize all instructions. 
- for (auto &[DefMI, Remat] : Rematerializations) { - MachineBasicBlock::iterator InsertPos(Remat.UseMI); - Register Reg = DefMI->getOperand(0).getReg(); - unsigned DefRegion = MIRegion.at(DefMI); - - // Rematerialize DefMI to its use block. - TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); - Remat.RematMI = &*std::prev(InsertPos); - DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); - - // Update region boundaries in regions we sinked from (remove defining MI) - // and to (insert MI rematerialized in use block). Only then we can erase - // the original MI. - DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr); - auto UseRegion = MIRegion.find(Remat.UseMI); - if (UseRegion != MIRegion.end()) { - DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos, - Remat.RematMI); + const unsigned NumRegions = DAG.Regions.size(); + MinFreq = MBFI.getEntryFreq().getFrequency(); + MaxFreq = 0; + Regions.reserve(NumRegions); + for (unsigned I = 0; I < NumRegions; ++I) { + MachineBasicBlock *MBB = DAG.Regions[I].first->getParent(); + uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency(); + Regions.push_back(BlockFreq); + if (BlockFreq && BlockFreq < MinFreq) + MinFreq = BlockFreq; + else if (BlockFreq > MaxFreq) + MaxFreq = BlockFreq; + } + if (!MinFreq) + return; + + // Scale everything down if frequencies are high. 
+ if (MinFreq >= ScaleFactor * ScaleFactor) { + for (uint64_t &Freq : Regions) + Freq /= ScaleFactor; + MinFreq /= ScaleFactor; + MaxFreq /= ScaleFactor; + } +} + +PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq, + const GCNScheduleDAGMILive &DAG) + : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {} + +unsigned PreRARematStage::ScoredRemat::getNumRegs( + const GCNScheduleDAGMILive &DAG) const { + const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg()); + unsigned RegSize = DAG.TRI->getRegSizeInBits(RC); + if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) { + // The following may return -1 (i.e., a large unsigned number) on indices + // that may be used to access subregisters of multiple sizes; in such cases + // fallback on the size derived from the register class. + unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx); + if (SubRegSize < RegSize) + RegSize = SubRegSize; + } + return divideCeil(RegSize, 32); +} + +int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const { + // Get frequencies of defining and using regions. A rematerialization from the + // least frequent region to the most frequent region will yield the greatest + // latency penalty and therefore should get minimum score. Reciprocally, a + // rematerialization in the other direction should get maximum score. Default + // to values that will yield the worst possible score given known frequencies + // in order to penalize rematerializations from or into regions whose + // frequency is unknown. 
+ int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq); + int64_t UseOrMax = Freq.Regions[Remat->UseRegion]; + if (!UseOrMax) + UseOrMax = Freq.MaxFreq; + return DefOrMin - UseOrMax; +} + +void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions, + ArrayRef<GCNRPTarget> RPTargets, + const FreqInfo &FreqInfo, + bool ReduceSpill) { + MaxFreq = 0; + RegionImpact = 0; + for (unsigned I : TargetRegions.set_bits()) { + if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg())) + continue; + bool UnusedLT = Remat->isUnusedLiveThrough(I); + + // Regions in which RP is guaranteed to decrease have more weight. + RegionImpact += UnusedLT ? 2 : 1; + + if (ReduceSpill) { + uint64_t Freq = FreqInfo.Regions[I]; + if (!UnusedLT) { + // Apply a frequency penalty in regions in which we are not sure that RP + // will decrease. + Freq /= 2; + } + MaxFreq = std::max(MaxFreq, Freq); } - DAG.LIS->RemoveMachineInstrFromMaps(*DefMI); - DefMI->eraseFromParent(); + } + RegionImpact *= NumRegs; +} - // Collect all regions impacted by the rematerialization and update their - // live-in/RP information. - for (unsigned I : Remat.LiveInRegions) { - ImpactedRegions.insert({I, DAG.Pressure[I]}); - GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I]; +void PreRARematStage::rematerialize(const RematReg &Remat, + BitVector &RecomputeRP, + RollbackInfo *Rollback) { + const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); + MachineInstr &DefMI = *Remat.DefMI; + Register Reg = DefMI.getOperand(0).getReg(); + Register NewReg = DAG.MRI.cloneVirtualRegister(Reg); + + // Rematerialize the register in the region where it is used. 
+ MachineBasicBlock::iterator InsertPos = Remat.UseMI; + TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI); + MachineInstr *RematMI = &*std::prev(InsertPos); + Remat.UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI); + Remat.insertMI(Remat.UseRegion, RematMI, DAG); + if (Rollback) { + Rollback->RematMI = RematMI; + // Make the original MI a debug value so that it does not influence + // scheduling and replace all read registers with a sentinel register to + // prevent operands to appear in use-lists of other MIs during LIS + // updates. Store mappings between operand indices and original registers + // for potential rollback. + DefMI.setDesc(TII->get(TargetOpcode::DBG_VALUE)); + for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) { + if (MO.isReg() && MO.readsReg()) { + Rollback->RegMap.insert({Idx, MO.getReg()}); + MO.setReg(Register()); + } + } + } else { + // Just delete the original instruction if it cannot be rolled back. + DAG.deleteMI(Remat.DefRegion, &DefMI); + } #ifdef EXPENSIVE_CHECKS - // All uses are known to be available / live at the remat point. Thus, the - // uses should already be live in to the region. - for (MachineOperand &MO : DefMI->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; + // All uses are known to be available / live at the remat point. Thus, + // the uses should already be live in to the using region. 
+ for (MachineOperand &MO : DefMI.operands()) { + if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) + continue; - Register UseReg = MO.getReg(); - if (!UseReg.isVirtual()) - continue; + Register UseReg = MO.getReg(); + if (!UseReg.isVirtual()) + continue; - LiveInterval &LI = DAG.LIS->getInterval(UseReg); - LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); - if (LI.hasSubRanges() && MO.getSubReg()) - LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); - - LaneBitmask LiveInMask = RegionLiveIns.at(UseReg); - LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); - // If this register has lanes not covered by the LiveIns, be sure they - // do not map to any subrange. ref: - // machine-scheduler-sink-trivial-remats.mir::omitted_subrange - if (UncoveredLanes.any()) { - assert(LI.hasSubRanges()); - for (LiveInterval::SubRange &SR : LI.subranges()) - assert((SR.LaneMask & UncoveredLanes).none()); - } - } + LiveInterval &LI = DAG.LIS->getInterval(UseReg); + LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); + if (LI.hasSubRanges() && MO.getSubReg()) + LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); + + LaneBitmask LiveInMask = DAG.LiveIns[Remat.UseRegion].at(UseReg); + LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); + // If this register has lanes not covered by the LiveIns, be sure they + // do not map to any subrange. ref: + // machine-scheduler-sink-trivial-remats.mir::omitted_subrange + if (UncoveredLanes.any()) { + assert(LI.hasSubRanges()); + for (LiveInterval::SubRange &SR : LI.subranges()) + assert((SR.LaneMask & UncoveredLanes).none()); + } + } #endif - // The register is no longer a live-in in all regions but the one that - // contains the single use. In live-through regions, maximum register - // pressure decreases predictably so we can directly update it. In the - // using region, maximum RP may or may not decrease, so we will mark it - // for re-computation after all materializations have taken place. 
- LaneBitmask PrevMask = RegionLiveIns[Reg];
- RegionLiveIns.erase(Reg);
- RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
- if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
- DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
- else
- RecomputeRP.insert(I);
+ // Remove the register from all regions where it is a live-in or live-out
+ // and adjust RP targets. The save is guaranteed in regions in which the
+ // register is live-through and unused but optimistic in all other regions
+ // where the register is live.
+ for (unsigned I : Remat.Live.set_bits()) {
+ RPTargets[I].saveReg(Reg, Remat.Mask, DAG.MRI);
+ DAG.LiveIns[I].erase(Reg);
+ DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
+ if (!Remat.isUnusedLiveThrough(I))
+ RecomputeRP.set(I);
+ }
+
+ RescheduleRegions |= Remat.Live;
+}
+
+void PreRARematStage::commitRematerializations() const {
+ REMAT_DEBUG(dbgs() << "Committing all rematerializations\n");
+ for (const RollbackInfo &Rollback : Rollbacks)
+ DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
+}
+
+void PreRARematStage::unsetSatisifedRPTargets(const BitVector &Regions) {
+ for (unsigned I : Regions.set_bits()) {
+ if (TargetRegions[I] && RPTargets[I].satisfied()) {
+ REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
+ TargetRegions.reset(I);
}
- // RP in the region from which the instruction was rematerialized may or may
- // not decrease.
- ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
- RecomputeRP.insert(DefRegion);
-
- // Recompute live interval to reflect the register's rematerialization.
- Register RematReg = Remat.RematMI->getOperand(0).getReg();
- DAG.LIS->removeInterval(RematReg);
- DAG.LIS->createAndComputeVirtRegInterval(RematReg);
- }
-
- // All regions impacted by at least one rematerialization must be rescheduled.
- // Maximum pressure must also be recomputed for all regions where it changed - // non-predictably and checked against the target occupancy. - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - AchievedOcc = MFI.getMaxWavesPerEU(); - for (auto &[I, OriginalRP] : ImpactedRegions) { - bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; - RescheduleRegions[I] = !IsEmptyRegion; - if (!RecomputeRP.contains(I)) - continue; + } +} - GCNRegPressure RP; - if (IsEmptyRegion) { - RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); - } else { - GCNDownwardRPTracker RPT(*DAG.LIS); - auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first, - DAG.Regions[I].second); - if (NonDbgMI == DAG.Regions[I].second) { - // Region is non-empty but contains only debug instructions. - RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]); - } else { - RPT.reset(*NonDbgMI, &DAG.LiveIns[I]); - RPT.advance(DAG.Regions[I].second); - RP = RPT.moveMaxPressure(); - } +bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) { + bool TooOptimistic = false; + for (unsigned I : Regions.set_bits()) { + GCNRPTarget &Target = RPTargets[I]; + Target.setRP(DAG.getRealRegPressure(I)); + + // Since we were optimistic in assessing RP decreases in these regions, we + // may need to remark the target as a target region if RP didn't decrease + // as expected. 
+ if (!TargetRegions[I] && !Target.satisfied()) { + REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n"); + TooOptimistic = true; + TargetRegions.set(I); } - DAG.Pressure[I] = RP; - AchievedOcc = - std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } - REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); + return TooOptimistic; } // Copied from MachineLICM @@ -2141,80 +3074,116 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) { void PreRARematStage::finalizeGCNSchedStage() { // We consider that reducing spilling is always beneficial so we never - // rollback rematerializations in such cases. It's also possible that - // rescheduling lowers occupancy over the one achieved just through remats, in - // which case we do not want to rollback either (the rescheduling was already - // reverted in PreRARematStage::shouldRevertScheduling in such cases). - unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!TargetOcc || MaxOcc >= *TargetOcc) + // rollback rematerializations or revert scheduling in such cases. + if (!TargetOcc) return; - REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); - const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo(); + // When increasing occupancy, it is possible that re-scheduling is not able to + // achieve the target occupancy in all regions, in which case re-scheduling in + // all regions should be reverted. + if (DAG.MinOccupancy >= *TargetOcc) { + commitRematerializations(); + return; + } - // Rollback the rematerializations. - for (const auto &[DefMI, Remat] : Rematerializations) { - MachineInstr &RematMI = *Remat.RematMI; - unsigned DefRegion = MIRegion.at(DefMI); - MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second); - MachineBasicBlock *MBB = RegionBB[DefRegion]; - Register Reg = RematMI.getOperand(0).getReg(); - - // Re-rematerialize MI at the end of its original region. 
Note that it may - // not be rematerialized exactly in the same position as originally within - // the region, but it should not matter much. - TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, - *DAG.TRI); - MachineInstr *NewMI = &*std::prev(InsertPos); - DAG.LIS->InsertMachineInstrInMaps(*NewMI); - - auto UseRegion = MIRegion.find(Remat.UseMI); - if (UseRegion != MIRegion.end()) { - DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI, - nullptr); + // It is possible that re-scheduling lowers occupancy over the one achieved + // just through rematerializations, in which case we revert re-scheduling in + // all regions but do not roll back rematerializations. + const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc; + + // When we both need to revert re-scheduling and rollback rematerializations, + // restore rematerialized MIs' original state before reverting so that they + // are treated as non-debug instructions by the revert logic. + if (ShouldRollbackRemats) { + for (const RollbackInfo &Rollback : Rollbacks) { + const auto &[Remat, RematMI, RegMap] = Rollback; + Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode())); + for (const auto &[MOIdx, Reg] : RegMap) + Remat->DefMI->getOperand(MOIdx).setReg(Reg); } - DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI); + } - // Erase rematerialized MI. - DAG.LIS->RemoveMachineInstrFromMaps(RematMI); - RematMI.eraseFromParent(); + // Revert re-scheduling in all affected regions. 
+ for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) { + REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx + << '\n'); + DAG.Pressure[RegionIdx] = MaxPressure; + modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder); + } - // Recompute live interval for the re-rematerialized register + if (!ShouldRollbackRemats) { + commitRematerializations(); + DAG.setTargetOccupancy(AchievedOcc); + return; + } + + // Reset the target occupancy to what it was pre-rematerialization. + DAG.setTargetOccupancy(*TargetOcc - 1); + + // Finish rolling back rematerializations, then recompute pressure in all + // affected regions. + REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n"); + BitVector RecomputeRP(DAG.Regions.size()); + DenseSet<Register> RecomputeLI; + for (const RollbackInfo &Rollback : Rollbacks) { + const auto &[Remat, RematMI, RegMap] = Rollback; + + // Switch back to using the original register and delete the + // rematerialization. + Register Reg = RematMI->getOperand(0).getReg(); + Register OriginalReg = Remat->DefMI->getOperand(0).getReg(); + Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI); + REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion + << "] Deleting rematerialization " << *RematMI); + DAG.deleteMI(Remat->UseRegion, RematMI); + + // Re-add the defined register as a live-in/live-out in all regions it used + // to be one in. + std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask); + for (unsigned I : Remat->LiveIn.set_bits()) + DAG.LiveIns[I].insert(LiveReg); + for (unsigned I : Remat->LiveOut.set_bits()) + DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg); + + RecomputeRP |= Rollback.Remat->Live; + // Regenerate intervals for all register operands of rematerialized MIs as + // slot indices may have changed slightly from before re-scheduling. 
+ for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) { + if (MO.isReg() && MO.getReg().isVirtual()) + RecomputeLI.insert(MO.getReg()); + } + } + for (Register Reg : RecomputeLI) { DAG.LIS->removeInterval(Reg); DAG.LIS->createAndComputeVirtRegInterval(Reg); - - // Re-add the register as a live-in in all regions it used to be one in. - for (unsigned LIRegion : Remat.LiveInRegions) - DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})}); } - - // Reset RP in all impacted regions. - for (auto &[I, OriginalRP] : ImpactedRegions) - DAG.Pressure[I] = OriginalRP; +#ifdef EXPENSIVE_CHECKS + // In particular, we want to check for coherent MI/slot order in regions in + // which reverts and/or rollbacks may have happened. + MF.verify(); +#endif + for (unsigned I : RecomputeRP.set_bits()) + DAG.Pressure[I] = DAG.getRealRegPressure(I); GCNSchedStage::finalizeGCNSchedStage(); } -void GCNScheduleDAGMILive::updateRegionBoundaries( - RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, - MachineInstr *NewMI) { - assert((!NewMI || NewMI != RegionBounds.second) && - "cannot remove at region end"); - - if (RegionBounds.first == RegionBounds.second) { - assert(NewMI && "cannot remove from an empty region"); - RegionBounds.first = NewMI; - return; - } +void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) { + // It's not possible for the deleted instruction to be upper region boundary + // since we don't delete region terminators. + if (Regions[RegionIdx].first == MI) + Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI)); + LIS->removeInterval(MI->getOperand(0).getReg()); + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); +} - // We only care for modifications at the beginning of a non-empty region since - // the upper region boundary is exclusive. 
- if (MI != RegionBounds.first) - return; - if (!NewMI) - RegionBounds.first = std::next(MI); // Removal +void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) { + MinOccupancy = TargetOccupancy; + if (MFI.getOccupancy() < TargetOccupancy) + MFI.increaseOccupancy(MF, MinOccupancy); else - RegionBounds.first = NewMI; // Insertion + MFI.limitOccupancy(MinOccupancy); } static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 975781f..6b6a403 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -16,6 +16,9 @@ #include "GCNRegPressure.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -28,11 +31,12 @@ class GCNSchedStage; enum class GCNSchedStageID : unsigned { OccInitialSchedule = 0, - UnclusteredHighRPReschedule = 1, - ClusteredLowOccupancyReschedule = 2, - PreRARematerialize = 3, - ILPInitialSchedule = 4, - MemoryClauseInitialSchedule = 5 + RewriteMFMAForm = 1, + UnclusteredHighRPReschedule = 2, + ClusteredLowOccupancyReschedule = 3, + PreRARematerialize = 4, + ILPInitialSchedule = 5, + MemoryClauseInitialSchedule = 6 }; #ifndef NDEBUG @@ -183,7 +187,7 @@ class ScheduleMetrics { unsigned BubbleCycles; public: - ScheduleMetrics() {} + ScheduleMetrics() = default; ScheduleMetrics(unsigned L, unsigned BC) : ScheduleLength(L), BubbleCycles(BC) {} unsigned getLength() const { return ScheduleLength; } @@ -198,8 +202,7 @@ public: }; inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { - dbgs() << "\n Schedule Metric (scaled by " - << ScheduleMetrics::ScaleFactor + dbgs() << "\n Schedule Metric (scaled by " << 
ScheduleMetrics::ScaleFactor << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/" << Sm.getLength() << " ]\n"; return OS; @@ -217,7 +220,7 @@ class RegionPressureMap { bool IsLiveOut; public: - RegionPressureMap() {} + RegionPressureMap() = default; RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) : DAG(GCNDAG), IsLiveOut(LiveOut) {} // Build the Instr->LiveReg and RegionIdx->Instr maps @@ -239,6 +242,7 @@ using RegionBoundaries = class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class OccInitialScheduleStage; + friend class RewriteMFMAFormStage; friend class UnclusteredHighRPStage; friend class ClusteredLowOccStage; friend class PreRARematStage; @@ -300,18 +304,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Compute and cache live-ins and pressure for all regions in block. void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB); - /// If necessary, updates a region's boundaries following insertion ( \p NewMI - /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region. - /// For an MI removal, this must be called before the MI is actually erased - /// from its parent MBB. - void updateRegionBoundaries(RegionBoundaries &RegionBounds, - MachineBasicBlock::iterator MI, - MachineInstr *NewMI); + /// Makes the scheduler try to achieve an occupancy of \p TargetOccupancy. + void setTargetOccupancy(unsigned TargetOccupancy); void runSchedStages(); std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID); + void deleteMI(unsigned RegionIdx, MachineInstr *MI); + public: GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S); @@ -367,12 +368,12 @@ public: // be skipped. virtual bool initGCNRegion(); + // Finalize state after scheduling a region. + virtual void finalizeGCNRegion(); + // Track whether a new region is also a new MBB. 
void setupNewBlock(); - // Finalize state after scheudling a region. - void finalizeGCNRegion(); - // Check result of scheduling. void checkScheduling(); @@ -397,8 +398,12 @@ public: // Returns true if the new schedule may result in more spilling. bool mayCauseSpilling(unsigned WavesAfter); - // Attempt to revert scheduling for this region. - void revertScheduling(); + /// Sets the schedule of region \p RegionIdx in block \p MBB to \p MIOrder. + /// The MIs in \p MIOrder must be exactly the same as the ones currently + /// existing inside the region, only in a different order that honors def-use + /// chains. + void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, + ArrayRef<MachineInstr *> MIOrder); void advanceRegion() { RegionIdx++; } @@ -413,10 +418,67 @@ public: : GCNSchedStage(StageID, DAG) {} }; +class RewriteMFMAFormStage : public GCNSchedStage { +private: + // Record regions with excess archvgpr register pressure over the physical + // register limit. Register pressure in these regions usually will result in + // spilling. + BitVector RegionsWithExcessArchVGPR; + + const SIInstrInfo *TII; + const SIRegisterInfo *SRI; + + /// Do a speculative rewrite and collect copy locations. The speculative + /// rewrite allows us to calculate the RP of the code after the rewrite, and + /// the copy locations allow us to calculate the total cost of copies required + /// for the rewrite. Stores the rewritten instructions in \p RewriteCands , + /// the copy locations for uses (of the MFMA result) in \p CopyForUse and the + /// copy locations for defs (of the MFMA operands) in \p CopyForDef + bool + initHeuristics(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + SmallPtrSetImpl<MachineInstr *> &CopyForDef); + + /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done + /// in initHeuristics. 
Uses \p CopyForUse and \p CopyForDef to calculate copy + /// costs, and \p RewriteCands to undo rewriting. + int64_t getRewriteCost( + const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, + const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, + const SmallPtrSetImpl<MachineInstr *> &CopyForDef); + + /// Do the final rewrite on \p RewriteCands and insert any needed copies. + bool + rewrite(const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands); + + /// \returns true if this MI is a rewrite candidate. + bool isRewriteCandidate(MachineInstr *MI) const; + + /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into \p + /// DefIdxs + void findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS, + SmallVectorImpl<SlotIndex> &DefIdxs); + + /// Finds all the reaching uses of \p DefMI and stores the use operands in \p + /// ReachingUses + void findReachingUses(MachineInstr *DefMI, LiveIntervals *LIS, + SmallVectorImpl<MachineOperand *> &ReachingUses); + +public: + bool initGCNSchedStage() override; + + RewriteMFMAFormStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + class UnclusteredHighRPStage : public GCNSchedStage { private: // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; + // Save the temporary target occupancy before starting this stage. + unsigned TempTargetOccupancy; + // Track whether any region was scheduled by this stage. + bool IsAnyRegionScheduled; public: bool initGCNSchedStage() override; @@ -447,65 +509,242 @@ public: }; /// Attempts to reduce function spilling or, if there is no spilling, to -/// increase function occupancy by one with respect to ArchVGPR usage by sinking -/// rematerializable instructions to their use. 
When the stage -/// estimates reducing spilling or increasing occupancy is possible, as few -/// instructions as possible are rematerialized to reduce potential negative +/// increase function occupancy by one with respect to register usage by sinking +/// rematerializable instructions to their use. When the stage estimates that +/// reducing spilling or increasing occupancy is possible, it tries to +/// rematerialize as few registers as possible to reduce potential negative /// effects on function latency. +/// +/// The stage only supports rematerializing registers that meet all of the +/// following constraints. +/// 1. The register is virtual and has a single defining instruction. +/// 2. The single defining instruction is either deemed rematerializable by the +/// target-independent logic, or if not, has no non-constant and +/// non-ignorable physical register use. +/// 3 The register has no virtual register use whose live range would be +/// extended by the rematerialization. +/// 4. The register has a single non-debug user in a different region from its +/// defining region. +/// 5. The register is not used by or using another register that is going to be +/// rematerialized. class PreRARematStage : public GCNSchedStage { private: - /// Useful information about a rematerializable instruction. - struct RematInstruction { - /// Single use of the rematerializable instruction's defined register, - /// located in a different block. + /// A rematerializable register. + struct RematReg { + /// Single MI defining the rematerializable register. + MachineInstr *DefMI; + /// Single user of the rematerializable register. MachineInstr *UseMI; - /// Rematerialized version of \p DefMI, set in - /// PreRARematStage::rematerialize. Used for reverting rematerializations. - MachineInstr *RematMI; - /// Set of regions in which the rematerializable instruction's defined - /// register is a live-in. 
- SmallDenseSet<unsigned, 4> LiveInRegions; + /// Regions in which the register is live-in/live-out/live anywhere. + BitVector LiveIn, LiveOut, Live; + /// The rematerializable register's lane bitmask. + LaneBitmask Mask; + /// Defining and using regions. + unsigned DefRegion, UseRegion; + + RematReg(MachineInstr *DefMI, MachineInstr *UseMI, + GCNScheduleDAGMILive &DAG, + const DenseMap<MachineInstr *, unsigned> &MIRegion); + + /// Returns the rematerializable register. Do not call after deleting the + /// original defining instruction. + Register getReg() const { return DefMI->getOperand(0).getReg(); } + + /// Determines whether this rematerialization may be beneficial in at least + /// one target region. + bool maybeBeneficial(const BitVector &TargetRegions, + ArrayRef<GCNRPTarget> RPTargets) const; + + /// Determines if the register is both unused and live-through in region \p + /// I. This guarantees that rematerializing it will reduce RP in the region. + bool isUnusedLiveThrough(unsigned I) const { + assert(I < Live.size() && "region index out of range"); + return LiveIn[I] && LiveOut[I] && I != UseRegion; + } + + /// Updates internal structures following a MI rematerialization. Part of + /// the stage instead of the DAG because it makes assumptions that are + /// specific to the rematerialization process. + void insertMI(unsigned RegionIdx, MachineInstr *RematMI, + GCNScheduleDAGMILive &DAG) const; + }; + + /// A scored rematerialization candidate. Higher scores indicate more + /// beneficial rematerializations. A null score indicate the rematerialization + /// is not helpful to reduce RP in target regions. + struct ScoredRemat { + /// The rematerializable register under consideration. + RematReg *Remat; + + /// Execution frequency information required by scoring heuristics. + /// Frequencies are scaled down if they are high to avoid overflow/underflow + /// when combining them. + struct FreqInfo { + /// Per-region execution frequencies. 0 when unknown. 
+ SmallVector<uint64_t> Regions; + /// Minimum and maximum observed frequencies. + uint64_t MinFreq, MaxFreq; + + FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG); + + private: + static const uint64_t ScaleFactor = 1024; + }; + + /// This only initializes state-independent characteristics of \p Remat, not + /// the actual score. + ScoredRemat(RematReg *Remat, const FreqInfo &Freq, + const GCNScheduleDAGMILive &DAG); + + /// Updates the rematerialization's score w.r.t. the current \p RPTargets. + /// \p RegionFreq indicates the frequency of each region + void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets, + const FreqInfo &Freq, bool ReduceSpill); + + /// Returns whether the current score is null, indicating the + /// rematerialization is useless. + bool hasNullScore() const { return !RegionImpact; } + + /// Compare score components of non-null scores pair-wise. A null score is + /// always strictly lesser than another non-null score. + bool operator<(const ScoredRemat &O) const { + if (hasNullScore()) + return !O.hasNullScore(); + if (O.hasNullScore()) + return false; + if (MaxFreq != O.MaxFreq) + return MaxFreq < O.MaxFreq; + if (FreqDiff != O.FreqDiff) + return FreqDiff < O.FreqDiff; + if (RegionImpact != O.RegionImpact) + return RegionImpact < O.RegionImpact; + // Break ties using pointer to rematerializable register. Rematerializable + // registers are collected in instruction order so, within the same + // region, this will prefer registers defined earlier that have longer + // live ranges in their defining region (since the registers we consider + // are always live-out in their defining region). + return Remat > O.Remat; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Printable print() const; +#endif + + private: + /// Number of 32-bit registers this rematerialization covers. 
+ unsigned NumRegs; + + // The three members below are the scoring components, top to bottom from + // most important to least important when comparing candidates. - RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {} + /// Frequency of impacted target region with highest known frequency. This + /// only matters when the stage is trying to reduce spilling, so it is + /// always 0 when it is not. + uint64_t MaxFreq; + /// Frequency difference between defining and using regions. Negative values + /// indicate we are rematerializing to higher frequency regions; positive + /// values indicate the contrary. + int64_t FreqDiff; + /// Expected number of target regions impacted by the rematerialization, + /// scaled by the size of the register being rematerialized. + unsigned RegionImpact; + + unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const; + + int64_t getFreqDiff(const FreqInfo &Freq) const; }; - /// Maps all MIs to their parent region. MI terminators are considered to be - /// outside the region they delimitate, and as such are not stored in the map. - DenseMap<MachineInstr *, unsigned> MIRegion; /// Parent MBB to each region, in region order. SmallVector<MachineBasicBlock *> RegionBB; - /// Collects instructions to rematerialize. - MapVector<MachineInstr *, RematInstruction> Rematerializations; - /// Collects regions whose live-ins or register pressure will change due to - /// rematerializations. - DenseMap<unsigned, GCNRegPressure> ImpactedRegions; - /// In case we need to rollback rematerializations, save lane masks for all - /// rematerialized registers in all regions in which they are live-ins. - DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks; - /// After successful stage initialization, indicates which regions should be - /// rescheduled. - BitVector RescheduleRegions; - /// The target occupancy the stage is trying to achieve. Empty when the + /// Register pressure targets for all regions. 
+ SmallVector<GCNRPTarget> RPTargets; + /// Regions which are above the stage's RP target. + BitVector TargetRegions; + /// The target occupancy the set is trying to achieve. Empty when the /// objective is spilling reduction. std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). - /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; + /// After successful stage initialization, indicates which regions should be + /// rescheduled. + BitVector RescheduleRegions; - /// Returns whether remat can reduce spilling or increase function occupancy - /// by 1 through rematerialization. If it can do one, collects instructions in - /// PreRARematStage::Rematerializations and sets the target occupancy in - /// PreRARematStage::TargetOccupancy. - bool canIncreaseOccupancyOrReduceSpill(); + /// List of rematerializable registers. + SmallVector<RematReg> RematRegs; + + /// Holds enough information to rollback a rematerialization decision post + /// re-scheduling. + struct RollbackInfo { + /// The rematerializable register under consideration. + const RematReg *Remat; + /// The rematerialized MI replacing the original defining MI. + MachineInstr *RematMI; + /// Maps register machine operand indices to their original register. + SmallDenseMap<unsigned, Register, 4> RegMap; + + RollbackInfo(const RematReg *Remat) : Remat(Remat) {} + }; + /// List of rematerializations to rollback if rematerialization does not end + /// up being beneficial. + SmallVector<RollbackInfo> Rollbacks; + + /// State of a region pre-re-scheduling but post-rematerializations that we + /// must keep to be able to revert re-scheduling effects. + struct RegionSchedRevert { + /// Region number; + unsigned RegionIdx; + /// Original instruction order (both debug and non-debug MIs). + std::vector<MachineInstr *> OrigMIOrder; + /// Maximum pressure recorded in the region. 
+ GCNRegPressure MaxPressure; + + RegionSchedRevert(unsigned RegionIdx, ArrayRef<MachineInstr *> OrigMIOrder, + const GCNRegPressure &MaxPressure) + : RegionIdx(RegionIdx), OrigMIOrder(OrigMIOrder), + MaxPressure(MaxPressure) {} + }; + /// After re-scheduling, contains pre-re-scheduling data for all re-scheduled + /// regions. + SmallVector<RegionSchedRevert> RegionReverts; + + /// Returns the occupancy the stage is trying to achieve. + unsigned getStageTargetOccupancy() const; + + /// Determines the stage's objective (increasing occupancy or reducing + /// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to + /// achieve that objective and mark those that don't achieve it in \ref + /// TargetRegions. Returns whether there is any target region. + bool setObjective(); + + /// Unsets target regions in \p Regions whose RP target has been reached. + void unsetSatisifedRPTargets(const BitVector &Regions); + + /// Fully recomputes RP from the DAG in \p Regions. Among those regions, sets + /// again all \ref TargetRegions that were optimistically marked as satisfied + /// but are actually not, and returns whether there were any such regions. + bool updateAndVerifyRPTargets(const BitVector &Regions); + + /// Collects all rematerializable registers and appends them to \ref + /// RematRegs. \p MIRegion maps MIs to their region. Returns whether any + /// rematerializable register was found. + bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion); + + /// Rematerializes \p Remat. This removes the rematerialized register from + /// live-in/out lists in the DAG and updates RP targets in all affected + /// regions, which are also marked in \ref RescheduleRegions. Regions in which + /// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback + /// is non-null, fills it with required information to be able to rollback the + /// rematerialization post-rescheduling. 
+ void rematerialize(const RematReg &Remat, BitVector &RecomputeRP, + RollbackInfo *Rollback); + + /// Deletes all rematerialized MIs from the MIR when they were kept around for + /// potential rollback. + void commitRematerializations() const; /// Whether the MI is rematerializable bool isReMaterializable(const MachineInstr &MI); - /// Rematerializes all instructions in PreRARematStage::Rematerializations - /// and stores the achieved occupancy after remat in - /// PreRARematStage::AchievedOcc. - void rematerialize(); - /// If remat alone did not increase occupancy to the target one, rollbacks all /// rematerializations and resets live-ins/RP in all regions impacted by the /// stage to their pre-stage values. @@ -516,10 +755,17 @@ public: bool initGCNRegion() override; + void finalizeGCNRegion() override; + bool shouldRevertScheduling(unsigned WavesAfter) override; PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) - : GCNSchedStage(StageID, DAG), RescheduleRegions(DAG.Regions.size()) {} + : GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()), + RescheduleRegions(DAG.Regions.size()) { + const unsigned NumRegions = DAG.Regions.size(); + RPTargets.reserve(NumRegions); + RegionBB.reserve(NumRegions); + } }; class ILPInitialScheduleStage : public GCNSchedStage { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index f291e37..da63628 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -119,15 +119,15 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // that do not support ADDR64 variants of MUBUF instructions. 
Such targets // cannot use a 64 bit offset with a MUBUF instruction to access the global // address space - if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = true; + if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) { + ToggleFeature(AMDGPU::FeatureUseFlatForGlobal); + UseFlatForGlobal = true; } // Unless +-flat-for-global is specified, use MUBUF instructions for global // address space access if flat operations are not available. - if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = false; + if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) { + ToggleFeature(AMDGPU::FeatureUseFlatForGlobal); + UseFlatForGlobal = false; } // Set defaults if needed. @@ -169,7 +169,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, : // clang-format off AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), - TargetTriple(TT), TargetID(*this), InstrItins(getInstrItineraryForCPU(GPU)), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), @@ -645,6 +644,8 @@ void GCNSubtarget::adjustSchedDependency( MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); unsigned Lat = 0; for (++I; I != E && I->isBundledWithPred(); ++I) { + if (I->isMetaInstruction()) + continue; if (I->modifiesRegister(Reg, TRI)) Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); else if (Lat) @@ -658,6 +659,8 @@ void GCNSubtarget::adjustSchedDependency( MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { + if (I->isMetaInstruction()) + continue; if (I->readsRegister(Reg, TRI)) break; --Lat; @@ -699,7 +702,7 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, 
KernargSegmentPtr = true; bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) + if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled()) PrivateSegmentBuffer = true; else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; @@ -717,13 +720,13 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, } if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && - (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - // FlatScratchInit cannot be true for graphics CC if enableFlatScratch() - // is false. - (ST.enableFlatScratch() || + (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) && + // FlatScratchInit cannot be true for graphics CC if + // hasFlatScratchEnabled() is false. + (ST.hasFlatScratchEnabled() || (!AMDGPU::isGraphics(CC) && !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) && - !ST.flatScratchIsArchitected()) { + !ST.hasArchitectedFlatScratch()) { FlatScratchInit = true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index c2e6078..b308e0d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -60,238 +60,25 @@ private: protected: // Basic subtarget description. - Triple TargetTriple; AMDGPU::IsaInfo::AMDGPUTargetID TargetID; unsigned Gen = INVALID; InstrItineraryData InstrItins; int LDSBankCount = 0; unsigned MaxPrivateElementSize = 0; - // Possibly statically set by tablegen, but may want to be overridden. - bool FastDenormalF32 = false; - bool HalfRate64Ops = false; - bool FullRate64Ops = false; - // Dynamically set bits that enable features. - bool FlatForGlobal = false; - bool AutoWaitcntBeforeBarrier = false; - bool BackOffBarrier = false; - bool UnalignedScratchAccess = false; - bool UnalignedAccessMode = false; - bool RelaxedBufferOOBMode = false; - bool HasApertureRegs = false; - bool SupportsXNACK = false; - bool KernargPreload = false; - - // This should not be used directly. 
'TargetID' tracks the dynamic settings - // for XNACK. - bool EnableXNACK = false; - - bool EnableTgSplit = false; - bool EnableCuMode = false; - bool TrapHandler = false; - bool EnablePreciseMemory = false; - - // Used as options. - bool EnableLoadStoreOpt = false; - bool EnableUnsafeDSOffsetFolding = false; - bool EnableSIScheduler = false; - bool EnableDS128 = false; - bool EnablePRTStrictNull = false; - bool DumpCode = false; - bool AssemblerPermissiveWavesize = false; - - // Subtarget statically properties set by tablegen - bool FP64 = false; - bool FMA = false; - bool MIMG_R128 = false; - bool CIInsts = false; - bool GFX8Insts = false; - bool GFX9Insts = false; - bool GFX90AInsts = false; - bool GFX940Insts = false; - bool GFX950Insts = false; - bool GFX10Insts = false; - bool GFX11Insts = false; - bool GFX12Insts = false; - bool GFX1250Insts = false; - bool GFX10_3Insts = false; - bool GFX7GFX8GFX9Insts = false; - bool SGPRInitBug = false; - bool UserSGPRInit16Bug = false; - bool NegativeScratchOffsetBug = false; - bool NegativeUnalignedScratchOffsetBug = false; - bool HasSMemRealTime = false; - bool HasIntClamp = false; - bool HasFmaMixInsts = false; - bool HasFmaMixBF16Insts = false; - bool HasMovrel = false; - bool HasVGPRIndexMode = false; - bool HasScalarDwordx3Loads = false; - bool HasScalarStores = false; - bool HasScalarAtomics = false; - bool HasSDWAOmod = false; - bool HasSDWAScalar = false; - bool HasSDWASdst = false; - bool HasSDWAMac = false; - bool HasSDWAOutModsVOPC = false; - bool HasDPP = false; - bool HasDPP8 = false; - bool HasDPALU_DPP = false; - bool HasDPPSrc1SGPR = false; - bool HasPackedFP32Ops = false; - bool HasImageInsts = false; - bool HasExtendedImageInsts = false; - bool HasR128A16 = false; - bool HasA16 = false; - bool HasG16 = false; - bool HasNSAEncoding = false; - bool HasPartialNSAEncoding = false; - bool GFX10_AEncoding = false; - bool GFX10_BEncoding = false; - bool HasDLInsts = false; - bool HasFmacF64Inst = false; - 
bool HasDot1Insts = false; - bool HasDot2Insts = false; - bool HasDot3Insts = false; - bool HasDot4Insts = false; - bool HasDot5Insts = false; - bool HasDot6Insts = false; - bool HasDot7Insts = false; - bool HasDot8Insts = false; - bool HasDot9Insts = false; - bool HasDot10Insts = false; - bool HasDot11Insts = false; - bool HasDot12Insts = false; - bool HasDot13Insts = false; - bool HasMAIInsts = false; - bool HasFP8Insts = false; - bool HasFP8ConversionInsts = false; - bool HasFP8E5M3Insts = false; - bool HasCvtFP8Vop1Bug = false; - bool HasPkFmacF16Inst = false; - bool HasAtomicFMinFMaxF32GlobalInsts = false; - bool HasAtomicFMinFMaxF64GlobalInsts = false; - bool HasAtomicFMinFMaxF32FlatInsts = false; - bool HasAtomicFMinFMaxF64FlatInsts = false; - bool HasAtomicDsPkAdd16Insts = false; - bool HasAtomicFlatPkAdd16Insts = false; - bool HasAtomicFaddRtnInsts = false; - bool HasAtomicFaddNoRtnInsts = false; - bool HasMemoryAtomicFaddF32DenormalSupport = false; - bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; - bool HasAtomicBufferGlobalPkAddF16Insts = false; - bool HasAtomicCSubNoRtnInsts = false; - bool HasAtomicGlobalPkAddBF16Inst = false; - bool HasAtomicBufferPkAddBF16Inst = false; - bool HasFlatAtomicFaddF32Inst = false; - bool HasFlatBufferGlobalAtomicFaddF64Inst = false; - bool HasDefaultComponentZero = false; - bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false; - bool HasEmulatedSystemScopeAtomics = false; - bool HasDefaultComponentBroadcast = false; - bool HasXF32Insts = false; + bool DynamicVGPR = false; + bool DynamicVGPRBlockSize32 = false; + bool ScalarizeGlobal = false; + /// The maximum number of instructions that may be placed within an S_CLAUSE, /// which is one greater than the maximum argument to S_CLAUSE. A value of 0 /// indicates a lack of S_CLAUSE support. 
unsigned MaxHardClauseLength = 0; - bool SupportsSRAMECC = false; - bool DynamicVGPR = false; - bool DynamicVGPRBlockSize32 = false; - bool HasVMemToLDSLoad = false; - bool RequiresAlignVGPR = false; - - // This should not be used directly. 'TargetID' tracks the dynamic settings - // for SRAMECC. - bool EnableSRAMECC = false; - - bool HasNoSdstCMPX = false; - bool HasVscnt = false; - bool HasWaitXcnt = false; - bool HasGetWaveIdInst = false; - bool HasSMemTimeInst = false; - bool HasShaderCyclesRegister = false; - bool HasShaderCyclesHiLoRegisters = false; - bool HasVOP3Literal = false; - bool HasNoDataDepHazard = false; - bool FlatAddressSpace = false; - bool FlatInstOffsets = false; - bool FlatGlobalInsts = false; - bool FlatScratchInsts = false; - bool FlatGVSMode = false; - bool ScalarFlatScratchInsts = false; - bool HasArchitectedFlatScratch = false; - bool EnableFlatScratch = false; - bool HasArchitectedSGPRs = false; - bool HasGDS = false; - bool HasGWS = false; - bool AddNoCarryInsts = false; - bool HasUnpackedD16VMem = false; - bool LDSMisalignedBug = false; - bool HasMFMAInlineLiteralBug = false; - bool UnalignedBufferAccess = false; - bool UnalignedDSAccess = false; - bool HasPackedTID = false; - bool ScalarizeGlobal = false; - bool HasSALUFloatInsts = false; - bool HasPseudoScalarTrans = false; - bool HasRestrictedSOffset = false; - bool Has64BitLiterals = false; - bool Has1024AddressableVGPRs = false; - bool HasBitOp3Insts = false; - bool HasTanhInsts = false; - bool HasTensorCvtLutInsts = false; - bool HasTransposeLoadF4F6Insts = false; - bool HasPrngInst = false; - bool HasBVHDualAndBVH8Insts = false; - bool HasPermlane16Swap = false; - bool HasPermlane32Swap = false; - bool HasVcmpxPermlaneHazard = false; - bool HasVMEMtoScalarWriteHazard = false; - bool HasSMEMtoVectorWriteHazard = false; - bool HasInstFwdPrefetchBug = false; - bool HasVmemPrefInsts = false; - bool HasSafeSmemPrefetch = false; - bool HasSafeCUPrefetch = false; - bool 
HasVcmpxExecWARHazard = false; - bool HasLdsBranchVmemWARHazard = false; - bool HasNSAtoVMEMBug = false; - bool HasNSAClauseBug = false; - bool HasOffset3fBug = false; - bool HasFlatSegmentOffsetBug = false; - bool HasImageStoreD16Bug = false; - bool HasImageGather4D16Bug = false; - bool HasMSAALoadDstSelBug = false; - bool HasPrivEnabledTrap2NopBug = false; - bool Has1_5xVGPRs = false; - bool HasMADIntraFwdBug = false; - bool HasVOPDInsts = false; - bool HasVALUTransUseHazard = false; - bool HasRequiredExportPriority = false; - bool HasVmemWriteVgprInOrder = false; - bool HasAshrPkInsts = false; - bool HasIEEEMinimumMaximumInsts = false; - bool HasMinimum3Maximum3F32 = false; - bool HasMinimum3Maximum3F16 = false; - bool HasMin3Max3PKF16 = false; - bool HasMinimum3Maximum3PKF16 = false; - bool HasLshlAddU64Inst = false; - bool HasAddSubU64Insts = false; - bool HasMadU32Inst = false; - bool HasPointSampleAccel = false; - bool HasLdsBarrierArriveAtomic = false; - bool HasSetPrioIncWgInst = false; - - bool RequiresCOV6 = false; - bool UseBlockVGPROpsForCSR = false; - bool HasGloballyAddressableScratch = false; - - bool Has45BitNumRecordsBufferResource = false; - - bool HasClusters = false; - - // Dummy feature to use for assembler in tablegen. - bool FeatureDisable = false; + +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "AMDGPUGenSubtargetInfo.inc" private: SIInstrInfo InstrInfo; @@ -303,24 +90,20 @@ public: const GCNTargetMachine &TM); ~GCNSubtarget() override; - GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS); + GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, + StringRef FS); /// Diagnose inconsistent subtarget features before attempting to codegen /// function \p F. 
void checkSubtargetFeatures(const Function &F) const; - const SIInstrInfo *getInstrInfo() const override { - return &InstrInfo; - } + const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; } const SIFrameLowering *getFrameLowering() const override { return &FrameLowering; } - const SITargetLowering *getTargetLowering() const override { - return &TLInfo; - } + const SITargetLowering *getTargetLowering() const override { return &TLInfo; } const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); @@ -358,9 +141,13 @@ public: void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); - Generation getGeneration() const { - return (Generation)Gen; - } + Generation getGeneration() const { return (Generation)Gen; } + + bool isGFX11Plus() const { return getGeneration() >= GFX11; } + +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool GETTER() const override { return ATTRIBUTE; } +#include "AMDGPUGenSubtargetInfo.inc" unsigned getMaxWaveScratchSize() const { // See COMPUTE_TMPRING_SIZE.WAVESIZE. @@ -381,12 +168,11 @@ public: return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); } - int getLDSBankCount() const { - return LDSBankCount; - } + int getLDSBankCount() const { return LDSBankCount; } unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { - return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; + return (ForBufferRSrc || !hasFlatScratchEnabled()) ? 
MaxPrivateElementSize + : 16; } unsigned getConstantBusLimit(unsigned Opcode) const; @@ -397,34 +183,12 @@ public: bool zeroesHigh16BitsOfDest(unsigned Opcode) const; bool supportsWGP() const { - if (GFX1250Insts) + if (HasGFX1250Insts) return false; return getGeneration() >= GFX10; } - bool hasIntClamp() const { - return HasIntClamp; - } - - bool hasFP64() const { - return FP64; - } - - bool hasMIMG_R128() const { - return MIMG_R128; - } - - bool hasHWFP64() const { - return FP64; - } - - bool hasHalfRate64Ops() const { - return HalfRate64Ops; - } - - bool hasFullRate64Ops() const { - return FullRate64Ops; - } + bool hasHWFP64() const { return HasFP64; } bool hasAddr64() const { return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); @@ -440,67 +204,19 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } - bool hasFractBug() const { - return getGeneration() == SOUTHERN_ISLANDS; - } - - bool hasBFE() const { - return true; - } - - bool hasBFI() const { - return true; - } - - bool hasBFM() const { - return hasBFE(); - } - - bool hasBCNT(unsigned Size) const { - return true; - } + bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; } - bool hasFFBL() const { - return true; - } - - bool hasFFBH() const { - return true; - } - - bool hasMed3_16() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; - } + bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } bool hasMin3Max3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasFmaMixInsts() const { - return HasFmaMixInsts; - } + bool hasSwap() const { return HasGFX9Insts; } - bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasScalarPackInsts() const { return HasGFX9Insts; } - bool hasCARRY() const { - return true; - } - - bool hasFMA() const { - return FMA; - } - - bool hasSwap() const { - return GFX9Insts; - } - - bool hasScalarPackInsts() const { - return GFX9Insts; - } - - bool hasScalarMulHiInsts() const { - return 
GFX9Insts; - } + bool hasScalarMulHiInsts() const { return HasGFX9Insts; } bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } @@ -515,9 +231,7 @@ public: /// True if the offset field of DS instructions works as expected. On SI, the /// offset uses a 16-bit adder and does not always wrap properly. - bool hasUsableDSOffset() const { - return getGeneration() >= SEA_ISLANDS; - } + bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; } bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; @@ -530,14 +244,10 @@ public: /// Extra wait hazard is needed in some cases before /// s_cbranch_vccnz/s_cbranch_vccz. - bool hasReadVCCZBug() const { - return getGeneration() <= SEA_ISLANDS; - } + bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; } /// Writes to VCC_LO/VCC_HI update the VCCZ flag. - bool partialVCCWritesUpdateVCCZ() const { - return getGeneration() >= GFX10; - } + bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; } /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR /// was written by a VALU instruction. @@ -551,19 +261,13 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } - bool hasRFEHazards() const { - return getGeneration() >= VOLCANIC_ISLANDS; - } + bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; } /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. unsigned getSetRegWaitStates() const { return getGeneration() <= SEA_ISLANDS ? 1 : 2; } - bool dumpCode() const { - return DumpCode; - } - /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. 
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, @@ -578,25 +282,15 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX10; } - bool useFlatForGlobal() const { - return FlatForGlobal; - } - /// \returns If target supports ds_read/write_b128 and user enables generation /// of ds_read/write_b128. - bool useDS128() const { - return CIInsts && EnableDS128; - } + bool useDS128() const { return HasCIInsts && EnableDS128; } /// \return If target supports ds_read/write_b96/128. - bool hasDS96AndDS128() const { - return CIInsts; - } + bool hasDS96AndDS128() const { return HasCIInsts; } /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 - bool haveRoundOpsF64() const { - return CIInsts; - } + bool haveRoundOpsF64() const { return HasCIInsts; } /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. @@ -606,91 +300,29 @@ public: /// \returns If target requires PRT Struct NULL support (zero result registers /// for sparse texture support). - bool usePRTStrictNull() const { - return EnablePRTStrictNull; - } - - bool hasAutoWaitcntBeforeBarrier() const { - return AutoWaitcntBeforeBarrier; - } - - /// \returns true if the target supports backing off of s_barrier instructions - /// when an exception is raised. 
- bool supportsBackOffBarrier() const { - return BackOffBarrier; - } - - bool hasUnalignedBufferAccess() const { - return UnalignedBufferAccess; - } + bool usePRTStrictNull() const { return EnablePRTStrictNull; } bool hasUnalignedBufferAccessEnabled() const { - return UnalignedBufferAccess && UnalignedAccessMode; - } - - bool hasUnalignedDSAccess() const { - return UnalignedDSAccess; + return HasUnalignedBufferAccess && HasUnalignedAccessMode; } bool hasUnalignedDSAccessEnabled() const { - return UnalignedDSAccess && UnalignedAccessMode; - } - - bool hasUnalignedScratchAccess() const { - return UnalignedScratchAccess; + return HasUnalignedDSAccess && HasUnalignedAccessMode; } bool hasUnalignedScratchAccessEnabled() const { - return UnalignedScratchAccess && UnalignedAccessMode; + return HasUnalignedScratchAccess && HasUnalignedAccessMode; } - bool hasUnalignedAccessMode() const { - return UnalignedAccessMode; - } - - bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; } - - bool hasApertureRegs() const { - return HasApertureRegs; - } + bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); } - bool isTrapHandlerEnabled() const { - return TrapHandler; - } + bool isTgSplitEnabled() const { return EnableTgSplit; } - bool isXNACKEnabled() const { - return TargetID.isXnackOnOrAny(); - } - - bool isTgSplitEnabled() const { - return EnableTgSplit; - } - - bool isCuModeEnabled() const { - return EnableCuMode; - } + bool isCuModeEnabled() const { return EnableCuMode; } bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } - bool hasFlatAddressSpace() const { - return FlatAddressSpace; - } - - bool hasFlatScrRegister() const { - return hasFlatAddressSpace(); - } - - bool hasFlatInstOffsets() const { - return FlatInstOffsets; - } - - bool hasFlatGlobalInsts() const { - return FlatGlobalInsts; - } - - bool hasFlatScratchInsts() const { - return FlatScratchInsts; - } + bool hasFlatScrRegister() const { return hasFlatAddressSpace(); } // 
Check if target supports ST addressing mode with FLAT scratch instructions. // The ST addressing mode means no registers are used, either VGPR or SGPR, @@ -699,24 +331,16 @@ public: return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); } - bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } + bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; } - bool hasScalarFlatScratchInsts() const { - return ScalarFlatScratchInsts; - } - - bool enableFlatScratch() const { - return flatScratchIsArchitected() || + bool hasFlatScratchEnabled() const { + return hasArchitectedFlatScratch() || (EnableFlatScratch && hasFlatScratchInsts()); } - bool hasGlobalAddTidInsts() const { - return GFX10_BEncoding; - } + bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; } - bool hasAtomicCSub() const { - return GFX10_BEncoding; - } + bool hasAtomicCSub() const { return HasGFX10_BEncoding; } bool hasMTBUFInsts() const { return !hasGFX1250Insts(); } @@ -726,7 +350,9 @@ public: return !hasGFX940Insts() && !hasGFX1250Insts(); } - bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); } + bool hasVINTERPEncoding() const { + return HasGFX11Insts && !hasGFX1250Insts(); + } // DS_ADD_F64/DS_ADD_RTN_F64 bool hasLdsAtomicAddF64() const { @@ -737,274 +363,45 @@ public: return getGeneration() >= GFX9; } - bool hasFlatSegmentOffsetBug() const { - return HasFlatSegmentOffsetBug; - } + bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; } - bool hasFlatLgkmVMemCountInOrder() const { - return getGeneration() > GFX9; - } - - bool hasD16LoadStore() const { - return getGeneration() >= GFX9; - } + bool hasD16LoadStore() const { return getGeneration() >= GFX9; } bool d16PreservesUnusedBits() const { return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); } - bool hasD16Images() const { - return getGeneration() >= VOLCANIC_ISLANDS; - } + bool hasD16Images() const { return getGeneration() >= 
VOLCANIC_ISLANDS; } /// Return if most LDS instructions have an m0 use that require m0 to be /// initialized. - bool ldsRequiresM0Init() const { - return getGeneration() < GFX9; - } + bool ldsRequiresM0Init() const { return getGeneration() < GFX9; } // True if the hardware rewinds and replays GWS operations if a wave is // preempted. // // If this is false, a GWS operation requires testing if a nack set the // MEM_VIOL bit, and repeating if so. - bool hasGWSAutoReplay() const { - return getGeneration() >= GFX9; - } + bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; } /// \returns if target has ds_gws_sema_release_all instruction. - bool hasGWSSemaReleaseAll() const { - return CIInsts; - } - - /// \returns true if the target has integer add/sub instructions that do not - /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, - /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier - /// for saturation. - bool hasAddNoCarry() const { - return AddNoCarryInsts; - } + bool hasGWSSemaReleaseAll() const { return HasCIInsts; } bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } - bool hasUnpackedD16VMem() const { - return HasUnpackedD16VMem; - } - // Covers VS/PS/CS graphics shaders bool isMesaGfxShader(const Function &F) const { return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); } - bool hasMad64_32() const { - return getGeneration() >= SEA_ISLANDS; - } - - bool hasSDWAOmod() const { - return HasSDWAOmod; - } - - bool hasSDWAScalar() const { - return HasSDWAScalar; - } - - bool hasSDWASdst() const { - return HasSDWASdst; - } - - bool hasSDWAMac() const { - return HasSDWAMac; - } - - bool hasSDWAOutModsVOPC() const { - return HasSDWAOutModsVOPC; - } - - bool hasDLInsts() const { - return HasDLInsts; - } - - bool hasFmacF64Inst() const { return HasFmacF64Inst; } - - bool hasDot1Insts() const { - return HasDot1Insts; - } - - bool 
hasDot2Insts() const { - return HasDot2Insts; - } - - bool hasDot3Insts() const { - return HasDot3Insts; - } - - bool hasDot4Insts() const { - return HasDot4Insts; - } - - bool hasDot5Insts() const { - return HasDot5Insts; - } - - bool hasDot6Insts() const { - return HasDot6Insts; - } - - bool hasDot7Insts() const { - return HasDot7Insts; - } - - bool hasDot8Insts() const { - return HasDot8Insts; - } - - bool hasDot9Insts() const { - return HasDot9Insts; - } - - bool hasDot10Insts() const { - return HasDot10Insts; - } - - bool hasDot11Insts() const { - return HasDot11Insts; - } - - bool hasDot12Insts() const { - return HasDot12Insts; - } - - bool hasDot13Insts() const { - return HasDot13Insts; - } - - bool hasMAIInsts() const { - return HasMAIInsts; - } - - bool hasFP8Insts() const { - return HasFP8Insts; - } - - bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } - - bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; } - - bool hasPkFmacF16Inst() const { - return HasPkFmacF16Inst; - } - - bool hasAtomicFMinFMaxF32GlobalInsts() const { - return HasAtomicFMinFMaxF32GlobalInsts; - } - - bool hasAtomicFMinFMaxF64GlobalInsts() const { - return HasAtomicFMinFMaxF64GlobalInsts; - } - - bool hasAtomicFMinFMaxF32FlatInsts() const { - return HasAtomicFMinFMaxF32FlatInsts; - } - - bool hasAtomicFMinFMaxF64FlatInsts() const { - return HasAtomicFMinFMaxF64FlatInsts; - } - - bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } - - bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } + bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; } bool hasAtomicFaddInsts() const { return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; } - bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } - - bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } - - bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { - return HasAtomicBufferGlobalPkAddF16NoRtnInsts; - } - - bool 
hasAtomicBufferGlobalPkAddF16Insts() const { - return HasAtomicBufferGlobalPkAddF16Insts; - } - - bool hasAtomicGlobalPkAddBF16Inst() const { - return HasAtomicGlobalPkAddBF16Inst; - } - - bool hasAtomicBufferPkAddBF16Inst() const { - return HasAtomicBufferPkAddBF16Inst; - } - - bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } - - /// \return true if the target has flat, global, and buffer atomic fadd for - /// double. - bool hasFlatBufferGlobalAtomicFaddF64Inst() const { - return HasFlatBufferGlobalAtomicFaddF64Inst; - } - - /// \return true if the target's flat, global, and buffer atomic fadd for - /// float supports denormal handling. - bool hasMemoryAtomicFaddF32DenormalSupport() const { - return HasMemoryAtomicFaddF32DenormalSupport; - } - - /// \return true if atomic operations targeting fine-grained memory work - /// correctly at device scope, in allocations in host or peer PCIe device - /// memory. - bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const { - return HasAgentScopeFineGrainedRemoteMemoryAtomics; - } - - /// \return true is HW emulates system scope atomics unsupported by the PCI-e - /// via CAS loop. 
- bool hasEmulatedSystemScopeAtomics() const { - return HasEmulatedSystemScopeAtomics; - } - - bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } - - bool hasDefaultComponentBroadcast() const { - return HasDefaultComponentBroadcast; - } - - bool hasNoSdstCMPX() const { - return HasNoSdstCMPX; - } - - bool hasVscnt() const { - return HasVscnt; - } - - bool hasGetWaveIdInst() const { - return HasGetWaveIdInst; - } - - bool hasSMemTimeInst() const { - return HasSMemTimeInst; - } - - bool hasShaderCyclesRegister() const { - return HasShaderCyclesRegister; - } - - bool hasShaderCyclesHiLoRegisters() const { - return HasShaderCyclesHiLoRegisters; - } - - bool hasVOP3Literal() const { - return HasVOP3Literal; - } - - bool hasNoDataDepHazard() const { - return HasNoDataDepHazard; - } - bool vmemWriteNeedsExpWaitcnt() const { return getGeneration() < SEA_ISLANDS; } @@ -1013,13 +410,7 @@ public: return getGeneration() == GFX10 || getGeneration() == GFX11; } - bool hasPrefetch() const { return GFX12Insts; } - - bool hasVmemPrefInsts() const { return HasVmemPrefInsts; } - - bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; } - - bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } + bool hasPrefetch() const { return HasGFX12Insts; } // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1034,15 +425,11 @@ public: // dynamic realignment in common cases. 
Align getStackAlignment() const { return Align(16); } - bool enableMachineScheduler() const override { - return true; - } + bool enableMachineScheduler() const override { return true; } bool useAA() const override; - bool enableSubRegLiveness() const override { - return true; - } + bool enableSubRegLiveness() const override { return true; } void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } @@ -1051,9 +438,7 @@ public: static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); // XXX - Why is this here if it isn't in the default pass set? - bool enableEarlyIfConversion() const override { - return true; - } + bool enableEarlyIfConversion() const override { return true; } void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override; @@ -1067,73 +452,35 @@ public: return AMDGPU::getMaxNumUserSGPRs(*this); } - bool hasSMemRealTime() const { - return HasSMemRealTime; - } - - bool hasMovrel() const { - return HasMovrel; - } - - bool hasVGPRIndexMode() const { - return HasVGPRIndexMode; - } - bool useVGPRIndexMode() const; bool hasScalarCompareEq64() const { return getGeneration() >= VOLCANIC_ISLANDS; } - bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } - - bool hasScalarStores() const { - return HasScalarStores; - } - - bool hasScalarAtomics() const { - return HasScalarAtomics; + bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; } + bool hasLDSFPAtomicAddF64() const { + return HasGFX90AInsts || HasGFX1250Insts; } - bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } - bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; } - /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
bool hasPermLane64() const { return getGeneration() >= GFX11; } - bool hasDPP() const { - return HasDPP; - } - - bool hasDPPBroadcasts() const { - return HasDPP && getGeneration() < GFX10; - } + bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; } bool hasDPPWavefrontShifts() const { return HasDPP && getGeneration() < GFX10; } - bool hasDPP8() const { - return HasDPP8; - } - - bool hasDPALU_DPP() const { - return HasDPALU_DPP; - } - - bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } - - bool hasPackedFP32Ops() const { - return HasPackedFP32Ops; + bool hasDPPRowShare() const { + return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10); } // Has V_PK_MOV_B32 opcode - bool hasPkMovB32() const { - return GFX90AInsts; - } + bool hasPkMovB32() const { return HasGFX90AInsts; } bool hasFmaakFmamkF32Insts() const { return getGeneration() >= GFX10 || hasGFX940Insts(); @@ -1141,96 +488,26 @@ public: bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); } - bool hasImageInsts() const { - return HasImageInsts; - } - - bool hasExtendedImageInsts() const { - return HasExtendedImageInsts; - } - - bool hasR128A16() const { - return HasR128A16; - } - - bool hasA16() const { return HasA16; } - - bool hasG16() const { return HasG16; } - - bool hasOffset3fBug() const { - return HasOffset3fBug; - } - - bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } - - bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } - - bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } - - bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } - - bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; } - - bool hasNSAEncoding() const { return HasNSAEncoding; } - bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } - bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } - unsigned getNSAMaxSize(bool HasSampler = false) const { return AMDGPU::getNSAMaxSize(*this, 
HasSampler); } - bool hasGFX10_AEncoding() const { - return GFX10_AEncoding; - } - - bool hasGFX10_BEncoding() const { - return GFX10_BEncoding; - } - - bool hasGFX10_3Insts() const { - return GFX10_3Insts; - } - bool hasMadF16() const; - bool hasMovB64() const { return GFX940Insts || GFX1250Insts; } - - bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; } // Scalar and global loads support scale_offset bit. - bool hasScaleOffset() const { return GFX1250Insts; } - - bool hasFlatGVSMode() const { return FlatGVSMode; } + bool hasScaleOffset() const { return HasGFX1250Insts; } // FLAT GLOBAL VOffset is signed - bool hasSignedGVSOffset() const { return GFX1250Insts; } + bool hasSignedGVSOffset() const { return HasGFX1250Insts; } - bool enableSIScheduler() const { - return EnableSIScheduler; - } + bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; } - bool loadStoreOptEnabled() const { - return EnableLoadStoreOpt; - } - - bool hasSGPRInitBug() const { - return SGPRInitBug; - } - - bool hasUserSGPRInit16Bug() const { - return UserSGPRInit16Bug && isWave32(); - } - - bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } - - bool hasNegativeUnalignedScratchOffsetBug() const { - return NegativeUnalignedScratchOffsetBug; - } - - bool hasMFMAInlineLiteralBug() const { - return HasMFMAInlineLiteralBug; + bool hasUserSGPRInit16BugInWave32() const { + return HasUserSGPRInit16Bug && isWave32(); } bool has12DWordStoreHazard() const { @@ -1238,9 +515,7 @@ public: } // \returns true if the subtarget supports DWORDX3 load/store instructions. 
- bool hasDwordx3LoadStores() const { - return CIInsts; - } + bool hasDwordx3LoadStores() const { return HasCIInsts; } bool hasReadM0MovRelInterpHazard() const { return getGeneration() == AMDGPUSubtarget::GFX9; @@ -1259,66 +534,32 @@ public: return getGeneration() == AMDGPUSubtarget::GFX9; } - bool hasVcmpxPermlaneHazard() const { - return HasVcmpxPermlaneHazard; - } - - bool hasVMEMtoScalarWriteHazard() const { - return HasVMEMtoScalarWriteHazard; - } - - bool hasSMEMtoVectorWriteHazard() const { - return HasSMEMtoVectorWriteHazard; - } - - bool hasLDSMisalignedBug() const { - return LDSMisalignedBug && !EnableCuMode; - } - - bool hasInstFwdPrefetchBug() const { - return HasInstFwdPrefetchBug; - } - - bool hasVcmpxExecWARHazard() const { - return HasVcmpxExecWARHazard; - } - - bool hasLdsBranchVmemWARHazard() const { - return HasLdsBranchVmemWARHazard; + bool hasLDSMisalignedBugInWGPMode() const { + return HasLDSMisalignedBug && !EnableCuMode; } // Shift amount of a 64 bit shift cannot be a highest allocated register // if also at the end of the allocation block. bool hasShift64HighRegBug() const { - return GFX90AInsts && !GFX940Insts; + return HasGFX90AInsts && !HasGFX940Insts; } // Has one cycle hazard on transcendental instruction feeding a // non transcendental VALU. - bool hasTransForwardingHazard() const { return GFX940Insts; } + bool hasTransForwardingHazard() const { return HasGFX940Insts; } // Has one cycle hazard on a VALU instruction partially writing dst with // a shift of result bits feeding another VALU instruction. - bool hasDstSelForwardingHazard() const { return GFX940Insts; } + bool hasDstSelForwardingHazard() const { return HasGFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } + bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. 
- bool hasVDecCoExecHazard() const { - return GFX940Insts; - } - - bool hasNSAtoVMEMBug() const { - return HasNSAtoVMEMBug; - } - - bool hasNSAClauseBug() const { return HasNSAClauseBug; } + bool hasVDecCoExecHazard() const { return HasGFX940Insts; } bool hasHardClauses() const { return MaxHardClauseLength > 0; } - bool hasGFX90AInsts() const { return GFX90AInsts; } - bool hasFPAtomicToDenormModeHazard() const { return getGeneration() == GFX10; } @@ -1333,77 +574,45 @@ public: return getGeneration() == GFX11; } - bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } - - bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } + bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; } bool requiresCodeObjectV6() const { return RequiresCOV6; } bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; } - bool hasGloballyAddressableScratch() const { - return HasGloballyAddressableScratch; - } - bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } - bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } + bool hasVALUReadSGPRHazard() const { + return HasGFX12Insts && !HasGFX1250Insts; + } bool setRegModeNeedsVNOPs() const { - return GFX1250Insts && getGeneration() == GFX12; + return HasGFX1250Insts && getGeneration() == GFX12; } /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return RequiresAlignVGPR; } /// Return true if the target has the S_PACK_HL_B32_B16 instruction. - bool hasSPackHL() const { return GFX11Insts; } + bool hasSPackHL() const { return HasGFX11Insts; } /// Return true if the target's EXP instruction has the COMPR flag, which /// affects the meaning of the EN (enable) bits. - bool hasCompressedExport() const { return !GFX11Insts; } + bool hasCompressedExport() const { return !HasGFX11Insts; } /// Return true if the target's EXP instruction supports the NULL export /// target. 
- bool hasNullExportTarget() const { return !GFX11Insts; } - - bool has1_5xVGPRs() const { return Has1_5xVGPRs; } - - bool hasVOPDInsts() const { return HasVOPDInsts; } + bool hasNullExportTarget() const { return !HasGFX11Insts; } bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } /// Return true if the target has the S_DELAY_ALU instruction. - bool hasDelayAlu() const { return GFX11Insts; } - - bool hasPackedTID() const { return HasPackedTID; } - - // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that - // hasGFX90AInsts is also true. - bool hasGFX940Insts() const { return GFX940Insts; } - - // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that - // hasGFX940Insts and hasGFX90AInsts are also true. - bool hasGFX950Insts() const { return GFX950Insts; } + bool hasDelayAlu() const { return HasGFX11Insts; } /// Returns true if the target supports /// global_load_lds_dwordx3/global_load_lds_dwordx4 or /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit. - bool hasLDSLoadB96_B128() const { - return hasGFX950Insts(); - } - - bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; } - - bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } - - bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } - - bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } - - bool hasRequiredExportPriority() const { return HasRequiredExportPriority; } - - bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; } + bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); } /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. @@ -1415,59 +624,25 @@ public: return getGeneration() == GFX12; } - /// \returns true if the target has instructions with xf32 format support. 
- bool hasXF32Insts() const { return HasXF32Insts; } - - bool hasBitOp3Insts() const { return HasBitOp3Insts; } - - bool hasPermlane16Swap() const { return HasPermlane16Swap; } - bool hasPermlane32Swap() const { return HasPermlane32Swap; } - bool hasAshrPkInsts() const { return HasAshrPkInsts; } - - bool hasMinimum3Maximum3F32() const { - return HasMinimum3Maximum3F32; - } - - bool hasMinimum3Maximum3F16() const { - return HasMinimum3Maximum3F16; + /// \returns true if the target has packed f32 instructions that only read 32 + /// bits from a scalar operand (SGPR or literal) and replicates the bits to + /// both channels. + bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { + return getGeneration() == GFX12 && HasGFX1250Insts; } - bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; } - - bool hasTanhInsts() const { return HasTanhInsts; } - - bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; } - - bool hasAddPC64Inst() const { return GFX1250Insts; } - - bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; } - - bool hasMinimum3Maximum3PKF16() const { - return HasMinimum3Maximum3PKF16; - } - - bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; } - - /// \returns true if the target has s_wait_xcnt insertion. Supported for - /// GFX1250. - bool hasWaitXCnt() const { return HasWaitXcnt; } + bool hasAddPC64Inst() const { return HasGFX1250Insts; } - // A single DWORD instructions can use a 64-bit literal. - bool has64BitLiterals() const { return Has64BitLiterals; } - - bool hasPointSampleAccel() const { return HasPointSampleAccel; } - - bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; } + /// \returns true if the target supports expert scheduling mode 2 which relies + /// on the compiler to insert waits to avoid hazards between VMEM and VALU + /// instructions in some instances. 
+ bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; } /// \returns The maximum number of instructions that can be enclosed in an /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that /// instruction. unsigned maxHardClauseLength() const { return MaxHardClauseLength; } - bool hasPrngInst() const { return HasPrngInst; } - - bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; } - /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -1496,50 +671,22 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - /// \returns true if the flat_scratch register is initialized by the HW. - /// In this case it is readonly. - bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } - - /// \returns true if the architected SGPRs are enabled. - bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } - - /// \returns true if Global Data Share is supported. - bool hasGDS() const { return HasGDS; } - - /// \returns true if Global Wave Sync is supported. - bool hasGWS() const { return HasGWS; } - /// \returns true if the machine has merged shaders in which s0-s7 are /// reserved by the hardware and user SGPRs start at s8 - bool hasMergedShaders() const { - return getGeneration() >= GFX9; - } + bool hasMergedShaders() const { return getGeneration() >= GFX9; } // \returns true if the target supports the pre-NGG legacy geometry path. bool hasLegacyGeometry() const { return getGeneration() < GFX11; } - // \returns true if preloading kernel arguments is supported. - bool hasKernargPreload() const { return KernargPreload; } - // \returns true if the target has split barriers feature bool hasSplitBarriers() const { return getGeneration() >= GFX12; } - // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. 
- bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; } - - // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a - // no-return form. - bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } - // \returns true if the target has DX10_CLAMP kernel descriptor mode bit bool hasDX10ClampMode() const { return getGeneration() < GFX12; } // \returns true if the target has IEEE kernel descriptor mode bit bool hasIEEEMode() const { return getGeneration() < GFX12; } - // \returns true if the target has IEEE fminimum/fmaximum instructions - bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; } - // \returns true if the target has WG_RR_MODE kernel descriptor mode bit bool hasRrWGMode() const { return getGeneration() >= GFX12; } @@ -1547,52 +694,43 @@ public: /// values. bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } - bool hasGFX1250Insts() const { return GFX1250Insts; } + bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; } - bool hasVOPD3() const { return GFX1250Insts; } - - // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. - bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } - - // \returns true if the target has V_MAD_U32 instruction. - bool hasMadU32Inst() const { return HasMadU32Inst; } + bool hasVOPD3() const { return HasGFX1250Insts; } // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. - bool hasVectorMulU64() const { return GFX1250Insts; } + bool hasVectorMulU64() const { return HasGFX1250Insts; } // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 // instructions. - bool hasMadU64U32NoCarry() const { return GFX1250Insts; } + bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; } // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. - bool hasIntMinMax64() const { return GFX1250Insts; } - - // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions. 
- bool hasAddMinMaxInsts() const { return GFX1250Insts; } - - // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. - bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } + bool hasIntMinMax64() const { return HasGFX1250Insts; } // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions. - bool hasPkMinMax3Insts() const { return GFX1250Insts; } + bool hasPkMinMax3Insts() const { return HasGFX1250Insts; } // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction. - bool hasSGetShaderCyclesInst() const { return GFX1250Insts; } - - // \returns true if target has S_SETPRIO_INC_WG instruction. - bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; } + bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; } // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead // of sign-extending. Note that GFX1250 has not only fixed the bug but also // extended VA to 57 bits. - bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } + bool hasGetPCZeroExtension() const { + return HasGFX12Insts && !HasGFX1250Insts; + } // \returns true if the target needs to create a prolog for backward // compatibility when preloading kernel arguments. bool needsKernArgPreloadProlog() const { - return hasKernargPreload() && !GFX1250Insts; + return hasKernargPreload() && !HasGFX1250Insts; } + bool hasCondSubInsts() const { return HasGFX12Insts; } + + bool hasSubClampInsts() const { return hasGFX10_3Insts(); } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); @@ -1724,9 +862,7 @@ public: /// unit requirement. 
unsigned getMaxNumVGPRs(const Function &F) const; - unsigned getMaxNumAGPRs(const Function &F) const { - return getMaxNumVGPRs(F); - } + unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); } /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number /// of waves per execution unit required for the function \p MF. @@ -1746,13 +882,9 @@ public: bool supportsWave64() const { return !hasGFX1250Insts(); } - bool isWave32() const { - return getWavefrontSize() == 32; - } + bool isWave32() const { return getWavefrontSize() == 32; } - bool isWave64() const { - return getWavefrontSize() == 64; - } + bool isWave64() const { return getWavefrontSize() == 64; } /// Returns if the wavesize of this subtarget is known reliable. This is false /// only for the a default target-cpu that does not have an explicit @@ -1809,11 +941,11 @@ public: // \returns true if the subtarget has a hazard requiring an "s_nop 0" // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". - bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; } + bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; } // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER. - bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; } + bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; } bool isDynamicVGPREnabled() const { return DynamicVGPR; } unsigned getDynamicVGPRBlockSize() const { @@ -1835,15 +967,21 @@ public: // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base // read. bool hasScratchBaseForwardingHazard() const { - return GFX1250Insts && getGeneration() == GFX12; + return HasGFX1250Insts && getGeneration() == GFX12; } - /// \returns true if the subtarget supports clusters of workgroups. 
- bool hasClusters() const { return HasClusters; } + // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit + // result. + bool hasFlatScratchHiInB64InstHazard() const { + return HasGFX1250Insts && getGeneration() == GFX12; + } - /// \returns true if the subtarget requires a wait for xcnt before atomic - /// flat/global stores & rmw. - bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } + /// \returns true if the subtarget requires a wait for xcnt before VMEM + /// accesses that must never be repeated in the event of a page fault/re-try. + /// Atomic stores/rmw and all volatile accesses fall under this criteria. + bool requiresWaitXCntForSingleAccessInstructions() const { + return HasGFX1250Insts; + } /// \returns the number of significant bits in the immediate field of the /// S_NOP instruction. @@ -1855,10 +993,28 @@ public: return 3; } - /// \returns true if the sub-target supports buffer resource (V#) with 45-bit - /// num_records. - bool has45BitNumRecordsBufferResource() const { - return Has45BitNumRecordsBufferResource; + bool supportsBPermute() const { + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + + bool supportsWaveWideBPermute() const { + return (getGeneration() <= AMDGPUSubtarget::GFX9 || + getGeneration() == AMDGPUSubtarget::GFX12) || + isWave32(); + } + + /// Return true if real (non-fake) variants of True16 instructions using + /// 16-bit registers should be code-generated. Fake True16 instructions are + /// identical to non-fake ones except that they take 32-bit registers as + /// operands and always use their low halves. + // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully + // supported and the support for fake True16 instructions is removed. 
+ bool useRealTrue16Insts() const { + return hasTrue16BitInsts() && EnableRealTrue16Insts; + } + + bool requiresWaitOnWorkgroupReleaseFence() const { + return getGeneration() >= GFX10 || isTgSplitEnabled(); } }; diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 9e66909..663f538 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -35,18 +35,18 @@ using namespace llvm; #define DEBUG_TYPE "gcn-vopd-utils" bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, - const MachineInstr &FirstMI, - const MachineInstr &SecondMI, bool IsVOPD3) { + const MachineInstr &MIX, + const MachineInstr &MIY, bool IsVOPD3) { namespace VOPD = AMDGPU::VOPD; - const MachineFunction *MF = FirstMI.getMF(); + const MachineFunction *MF = MIX.getMF(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); if (IsVOPD3 && !ST.hasVOPD3()) return false; - if (!IsVOPD3 && (TII.isVOP3(FirstMI) || TII.isVOP3(SecondMI))) + if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY))) return false; - if (TII.isDPP(FirstMI) || TII.isDPP(SecondMI)) + if (TII.isDPP(MIX) || TII.isDPP(MIY)) return false; const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo()); @@ -61,32 +61,24 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, UniqueLiterals.push_back(&Op); }; SmallVector<Register> UniqueScalarRegs; - assert([&]() -> bool { - for (auto MII = MachineBasicBlock::const_iterator(&FirstMI); - MII != FirstMI.getParent()->instr_end(); ++MII) { - if (&*MII == &SecondMI) - return true; - } - return false; - }() && "Expected FirstMI to precede SecondMI"); - // Cannot pair dependent instructions - for (const auto &Use : SecondMI.uses()) - if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI)) + + // MIX must not modify any registers used by MIY. 
+ for (const auto &Use : MIY.uses()) + if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI)) return false; auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) { - const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? FirstMI : SecondMI; + const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY; const MachineOperand &Operand = MI.getOperand(OperandIdx); if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg())) return Operand.getReg(); return Register(); }; - auto InstInfo = - AMDGPU::getVOPDInstInfo(FirstMI.getDesc(), SecondMI.getDesc()); + auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc()); for (auto CompIdx : VOPD::COMPONENTS) { - const MachineInstr &MI = (CompIdx == VOPD::X) ? FirstMI : SecondMI; + const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY; const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0); if (Src0.isReg()) { @@ -153,8 +145,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 // source-cache. bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 && - FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 && - SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32; + MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 && + MIY.getOpcode() == AMDGPU::V_MOV_B32_e32; bool AllowSameVGPR = ST.hasGFX1250Insts(); if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR, @@ -163,22 +155,23 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, if (IsVOPD3) { // BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero. - if (AMDGPU::hasNamedOperand(SecondMI.getOpcode(), AMDGPU::OpName::bitop3)) { + // MIX check is only relevant to scheduling? 
+ if (AMDGPU::hasNamedOperand(MIX.getOpcode(), AMDGPU::OpName::bitop3)) { const MachineOperand &Src2 = - *TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2); + *TII.getNamedOperand(MIX, AMDGPU::OpName::src2); if (!Src2.isImm() || Src2.getImm()) return false; } - if (AMDGPU::hasNamedOperand(FirstMI.getOpcode(), AMDGPU::OpName::bitop3)) { + if (AMDGPU::hasNamedOperand(MIY.getOpcode(), AMDGPU::OpName::bitop3)) { const MachineOperand &Src2 = - *TII.getNamedOperand(FirstMI, AMDGPU::OpName::src2); + *TII.getNamedOperand(MIY, AMDGPU::OpName::src2); if (!Src2.isImm() || Src2.getImm()) return false; } } - LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI - << "\n\tY: " << SecondMI << "\n"); + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX + << "\n\tY: " << MIY << "\n"); return true; } @@ -208,6 +201,15 @@ static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) return false; + assert([&]() -> bool { + for (auto MII = MachineBasicBlock::const_iterator(FirstMI); + MII != FirstMI->getParent()->instr_end(); ++MII) { + if (&*MII == &SecondMI) + return true; + } + return false; + }() && "Expected FirstMI to precede SecondMI"); + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3); }; diff --git a/llvm/lib/Target/AMDGPU/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td deleted file mode 100644 index 3d62641..0000000 --- a/llvm/lib/Target/AMDGPU/InstCombineTables.td +++ /dev/null @@ -1,10 +0,0 @@ -include "AMDGPU.td" - -def AMDGPUImageDMaskIntrinsicTable : GenericTable { - let FilterClass = "AMDGPUImageDMaskIntrinsic"; - let Fields = ["Intr"]; - - let PrimaryKey = ["Intr"]; - let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; - let PrimaryKeyEarlyOut = 1; -} diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index afaa190..9ec1213 100644 --- 
a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -322,13 +322,13 @@ bool AMDGPUCustomBehaviour::hasModifiersSet( } // taken from SIInstrInfo::isGWS() -bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const { +bool AMDGPUCustomBehaviour::isGWS(uint32_t Opcode) const { const MCInstrDesc &MCID = MCII.get(Opcode); return MCID.TSFlags & SIInstrFlags::GWS; } // taken from SIInstrInfo::isAlwaysGDS() -bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { +bool AMDGPUCustomBehaviour::isAlwaysGDS(uint32_t Opcode) const { return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_ADD_GS_REG_RTN || Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index cbc7427..aeb5c03 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -32,7 +32,7 @@ public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~AMDGPUInstrPostProcess() = default; + ~AMDGPUInstrPostProcess() override = default; void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; @@ -68,9 +68,9 @@ class AMDGPUCustomBehaviour : public CustomBehaviour { bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const; /// Helper function used in generateWaitCntInfo() - bool isGWS(uint16_t Opcode) const; + bool isGWS(uint32_t Opcode) const; /// Helper function used in generateWaitCntInfo() - bool isAlwaysGDS(uint16_t Opcode) const; + bool isAlwaysGDS(uint32_t Opcode) const; /// Helper function used in generateWaitCntInfo() bool isVMEM(const MCInstrDesc &MCID); /// This method gets called from checkCustomHazard when mca is attempting to @@ -88,7 +88,7 @@ public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, 
const MCInstrInfo &MCII); - ~AMDGPUCustomBehaviour() = default; + ~AMDGPUCustomBehaviour() override = default; /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 703ec0a..4aa4083 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -12,6 +12,7 @@ #include "SIDefines.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -336,7 +337,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, // \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or // \p Reg itself otherwise. -static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { +static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx < 0x100) @@ -355,10 +356,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. 
-static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, - const MCInstrDesc &Desc, - const MCRegisterInfo &MRI, - const AMDGPUMCInstrAnalysis &MIA) { +static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { unsigned VgprMSBs = MIA.getVgprMSBs(); if (!VgprMSBs) return Reg; @@ -403,10 +404,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - unsigned PrintReg = getRegForPrinting(Reg, MRI); + MCRegister PrintReg = getRegForPrinting(Reg, MRI); O << getRegisterName(PrintReg); - if (PrintReg != Reg.id()) + if (PrintReg != Reg) O << " /*" << getRegisterName(Reg) << "*/"; } @@ -490,6 +491,18 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, printRegularOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printAVLdSt32Align2RegOp(const MCInst *MI, + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + MCRegister Reg = MI->getOperand(OpNo).getReg(); + + // On targets with an even alignment requirement + if (MCRegister SubReg = MRI.getSubReg(Reg, AMDGPU::sub0)) + Reg = SubReg; + printRegOperand(Reg, O, MRI); +} + void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -610,6 +623,25 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateFP16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: { + if (AMDGPU::isGFX11Plus(STI)) { + // For GFX11+, the inline constant is duplicated to both channels, so we + // need to check if the low and high 16 bits are the same, and then if + // they can be printed as inline constant values. 
+ uint16_t Lo16 = static_cast<uint16_t>(Imm & 0xFFFF); + uint16_t Hi16 = static_cast<uint16_t>((Imm >> 16) & 0xFFFF); + if (Lo16 == Hi16 && + printImmediateFP16(static_cast<uint16_t>(Imm), STI, O)) + return; + } else { + // For pre-GFX11, the inline constant is in the low 16 bits, so we need + // to check if it can be printed as inline constant value. + if (isUInt<16>(Imm) && + printImmediateFP16(static_cast<uint16_t>(Imm), STI, O)) + return; + } + break; + } case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: if (isUInt<16>(Imm) && @@ -795,14 +827,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, // Intention: print disassembler message when invalid code is decoded, // for example sgpr register used in VReg or VISrc(VReg or imm) operand. const MCOperandInfo &OpInfo = Desc.operands()[OpNo]; - int16_t RCID = MII.getOpRegClassID( - OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); - if (RCID != -1) { + if (OpInfo.RegClass != -1) { + int16_t RCID = MII.getOpRegClassID( + OpInfo, STI.getHwMode(MCSubtargetInfo::HwMode_RegInfo)); const MCRegisterClass &RC = MRI.getRegClass(RCID); auto Reg = mc2PseudoReg(Op.getReg()); if (!RC.contains(Reg) && !isInlineValue(Reg)) { - O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) - << "\' register class*/"; + bool IsWaveSizeOp = OpInfo.isLookupRegClassByHwMode() && + (OpInfo.RegClass == AMDGPU::SReg_1 || + OpInfo.RegClass == AMDGPU::SReg_1_XEXEC); + // Suppress this comment for a mismatched wavesize. Some users expect to + // be able to assemble and disassemble modules with mixed wavesizes, but + // we do not know the subtarget in different functions in MC. + // + // TODO: Should probably print it anyway, maybe a more specific version. 
+ if (!IsWaveSizeOp) { + O << "/*Invalid register, operand has \'" << MRI.getRegClassName(&RC) + << "\' register class*/"; + } } } } else if (Op.isImm()) { @@ -844,6 +886,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: @@ -1331,12 +1374,9 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, return; O << Name; - for (int I = 0; I < NumOps; ++I) { - if (I != 0) - O << ','; - - O << !!(Ops[I] & Mod); - } + ListSeparator Sep(","); + for (int I = 0; I < NumOps; ++I) + O << Sep << !!(Ops[I] & Mod); if (HasDstSel) { O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL); @@ -1428,26 +1468,10 @@ void AMDGPUInstPrinter::printMatrixFMT(const MCInst *MI, unsigned OpNo, return; O << " matrix_" << AorB << "_fmt:"; - switch (Imm) { - default: + if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixFmt))) + O << WMMAMods::ModMatrixFmt[Imm]; + else O << Imm; - break; - case WMMA::MatrixFMT::MATRIX_FMT_FP8: - O << "MATRIX_FMT_FP8"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_BF8: - O << "MATRIX_FMT_BF8"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_FP6: - O << "MATRIX_FMT_FP6"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_BF6: - O << "MATRIX_FMT_BF6"; - break; - case WMMA::MatrixFMT::MATRIX_FMT_FP4: - O << "MATRIX_FMT_FP4"; - break; - } } void AMDGPUInstPrinter::printMatrixAFMT(const MCInst *MI, unsigned OpNo, @@ -1470,17 +1494,10 @@ void AMDGPUInstPrinter::printMatrixScale(const MCInst *MI, unsigned OpNo, return; O << " matrix_" << AorB << "_scale:"; - switch (Imm) { - default: + if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixScale))) + O << WMMAMods::ModMatrixScale[Imm]; + else O << Imm; - break; - case WMMA::MatrixScale::MATRIX_SCALE_ROW0: - 
O << "MATRIX_SCALE_ROW0"; - break; - case WMMA::MatrixScale::MATRIX_SCALE_ROW1: - O << "MATRIX_SCALE_ROW1"; - break; - } } void AMDGPUInstPrinter::printMatrixAScale(const MCInst *MI, unsigned OpNo, @@ -1503,20 +1520,10 @@ void AMDGPUInstPrinter::printMatrixScaleFmt(const MCInst *MI, unsigned OpNo, return; O << " matrix_" << AorB << "_scale_fmt:"; - switch (Imm) { - default: + if (Imm < static_cast<int64_t>(std::size(WMMAMods::ModMatrixScaleFmt))) + O << WMMAMods::ModMatrixScaleFmt[Imm]; + else O << Imm; - break; - case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E8: - O << "MATRIX_SCALE_FMT_E8"; - break; - case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E5M3: - O << "MATRIX_SCALE_FMT_E5M3"; - break; - case WMMA::MatrixScaleFmt::MATRIX_SCALE_FMT_E4M3: - O << "MATRIX_SCALE_FMT_E4M3"; - break; - } } void AMDGPUInstPrinter::printMatrixAScaleFmt(const MCInst *MI, unsigned OpNo, @@ -1574,14 +1581,10 @@ void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo, O << formatHex(static_cast<uint64_t>(Val)); } else { O << "gpr_idx("; - bool NeedComma = false; + ListSeparator Sep(","); for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { - if (Val & (1 << ModeId)) { - if (NeedComma) - O << ','; - O << IdSymbolic[ModeId]; - NeedComma = true; - } + if (Val & (1 << ModeId)) + O << Sep << IdSymbolic[ModeId]; } O << ')'; } @@ -1658,6 +1661,19 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printWaitEvent(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::WaitEvent; + const uint16_t Imm16 = static_cast<uint16_t>(MI->getOperand(OpNo).getImm()); + + StringRef EventName = getWaitEventMaskName(Imm16, STI); + if (EventName.empty()) + O << formatHex(static_cast<uint64_t>(Imm16)); + else + O << EventName; +} + static void printSwizzleBitmask(const uint16_t AndMask, const uint16_t OrMask, const uint16_t XorMask, @@ -1788,25 +1804,16 @@ void 
AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo, bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA); bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt; - bool NeedSpace = false; + ListSeparator Sep(" "); - if (!IsDefaultVmcnt || PrintAll) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultVmcnt || PrintAll) + O << Sep << "vmcnt(" << Vmcnt << ')'; - if (!IsDefaultExpcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultExpcnt || PrintAll) + O << Sep << "expcnt(" << Expcnt << ')'; - if (!IsDefaultLgkmcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } + if (!IsDefaultLgkmcnt || PrintAll) + O << Sep << "lgkmcnt(" << Lgkmcnt << ')'; } void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, @@ -1822,14 +1829,10 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, StringRef Name; unsigned Val; bool IsDefault; - bool NeedSpace = false; + ListSeparator Sep(" "); while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) { - if (!IsDefault || !HasNonDefaultVal) { - if (NeedSpace) - O << ' '; - O << Name << '(' << Val << ')'; - NeedSpace = true; - } + if (!IsDefault || !HasNonDefaultVal) + O << Sep << Name << '(' << Val << ')'; } } else { O << formatHex(Imm16); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index b27295e..5e9ebc6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -77,6 +77,9 @@ private: raw_ostream &O); void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printAVLdSt32Align2RegOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); 
void printImmediateBF16(uint32_t Imm, const MCSubtargetInfo &STI, @@ -232,6 +235,8 @@ protected: raw_ostream &O); void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitEvent(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSWaitCnt(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index ea758bb..029d2ea 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -95,6 +95,13 @@ private: void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, APInt &Inst, APInt &Scratch, const MCSubtargetInfo &STI) const; + + template <bool HasSrc0, bool HasSrc1, bool HasSrc2> + APInt postEncodeVOP3(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const; + + APInt postEncodeVOPCX(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace @@ -343,6 +350,14 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + // V_PK_FMAC_F16 has different inline constant behavior on pre-GFX11 vs + // GFX11+: pre-GFX11 produces (f16, 0), GFX11+ duplicates f16 to both + // halves. 
+ return AMDGPU::getPKFMACF16InlineEncoding(static_cast<uint32_t>(Imm), + AMDGPU::isGFX11Plus(STI)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) @@ -374,11 +389,6 @@ uint64_t AMDGPUMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; } -static bool isVCMPX64(const MCInstrDesc &Desc) { - return (Desc.TSFlags & SIInstrFlags::VOP3) && - Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC); -} - void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, @@ -403,18 +413,6 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, Encoding |= getImplicitOpSelHiEncoding(Opcode); } - // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. - // Documentation requires dst to be encoded as EXEC (0x7E), - // but it looks like the actual value encoded for dst operand - // is ignored by HW. It was decided to define dst as "do not care" - // in td files to allow disassembler accept any dst value. - // However, dst is encoded as EXEC for compatibility with SP3. 
- if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) { - assert((Encoding & 0xFF) == 0); - Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) & - AMDGPU::HWEncoding::LO256_REG_IDX_MASK; - } - for (unsigned i = 0; i < bytes; i++) { CB.push_back((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); } @@ -733,4 +731,37 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( llvm_unreachable("Encoding of this operand type is not supported yet."); } +template <bool HasSrc0, bool HasSrc1, bool HasSrc2> +APInt AMDGPUMCCodeEmitter::postEncodeVOP3(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const { + if (!AMDGPU::isGFX10Plus(STI)) + return EncodedValue; + // Set unused source fields in VOP3 encodings to inline immediate 0 to avoid + // hardware conservatively assuming the instruction reads SGPRs. + constexpr uint64_t InlineImmediate0 = 0x80; + if (!HasSrc0) + EncodedValue |= InlineImmediate0 << 32; + if (!HasSrc1) + EncodedValue |= InlineImmediate0 << 41; + if (!HasSrc2) + EncodedValue |= InlineImmediate0 << 50; + return EncodedValue; +} + +APInt AMDGPUMCCodeEmitter::postEncodeVOPCX(const MCInst &MI, APInt EncodedValue, + const MCSubtargetInfo &STI) const { + // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. + // Documentation requires dst to be encoded as EXEC (0x7E), + // but it looks like the actual value encoded for dst operand + // is ignored by HW. It was decided to define dst as "do not care" + // in td files to allow disassembler accept any dst value. + // However, dst is encoded as EXEC for compatibility with SP3. 
+ [[maybe_unused]] const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + assert((Desc.TSFlags & SIInstrFlags::VOP3) && + Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC)); + EncodedValue |= MRI.getEncodingValue(AMDGPU::EXEC_LO) & + AMDGPU::HWEncoding::LO256_REG_IDX_MASK; + return postEncodeVOP3<true, true, false>(MI, EncodedValue, STI); +} + #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index c27be02..63437779 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCExpr.h" -#include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } -/// Mimics GCNSubtarget::computeOccupancy for MCExpr. -/// -/// Remove dependency on GCNSubtarget and depend only only the necessary values -/// for said occupancy computation. Should match computeOccupancy implementation -/// without passing \p STM on. 
-const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy( - unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, - unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) { - unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); - unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); - unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); - unsigned Generation = STM.getGeneration(); - - auto CreateExpr = [&Ctx](unsigned Value) { - return MCConstantExpr::create(Value, Ctx); - }; - - return create(AGVK_Occupancy, - {CreateExpr(MaxWaves), CreateExpr(Granule), - CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation), - CreateExpr(InitOcc), NumSGPRs, NumVGPRs}, - Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); @@ -481,7 +455,7 @@ static void unaryOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, return; case MCUnaryExpr::Opcode::Minus: { KB.makeNegative(); - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } case MCUnaryExpr::Opcode::Not: { @@ -492,7 +466,7 @@ static void unaryOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, } case MCUnaryExpr::Opcode::Plus: { KB.makeNonNegative(); - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } } @@ -514,7 +488,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, knownBitsMapHelper(Arg, KBM, Depth + 1); KB |= KBM[Arg]; } - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } case AMDGPUMCExpr::VariantKind::AGVK_Max: { @@ -524,7 +498,7 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, knownBitsMapHelper(Arg, KBM, Depth + 1); KB = KnownBits::umax(KB, KBM[Arg]); } - KBM[Expr] = KB; + KBM[Expr] = std::move(KB); return; } case AMDGPUMCExpr::VariantKind::AGVK_ExtraSGPRs: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 54fcd2a..bf7b40b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -64,7 +64,7 @@ private: ArrayRef<const MCExpr *> Args; AMDGPUMCExpr(VariantKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx); - ~AMDGPUMCExpr(); + ~AMDGPUMCExpr() override; bool evaluateExtraSGPRs(MCValue &Res, const MCAssembler *Asm) const; bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; @@ -98,11 +98,6 @@ public: return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUMCExpr * - createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, - const GCNSubtarget &STM, MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 013cfeb..28b4da8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -168,7 +168,7 @@ bool AMDGPUMCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, void AMDGPUMCInstrAnalysis::updateState(const MCInst &Inst, uint64_t Addr) { if (Inst.getOpcode() == AMDGPU::S_SET_VGPR_MSB_gfx12) - VgprMSBs = Inst.getOperand(0).getImm(); + VgprMSBs = Inst.getOperand(0).getImm() & 0xff; else if (isTerminator(Inst)) VgprMSBs = 0; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 5a08573..86c5d1c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -114,10 +114,12 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = 
GK_GFX1151; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153: AK = GK_GFX1153; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1170: AK = GK_GFX1170; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250: AK = GK_GFX1250; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1251: AK = GK_GFX1251; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1310: AK = GK_GFX1310; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC: AK = GK_GFX9_4_GENERIC; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: AK = GK_GFX10_1_GENERIC; break; @@ -201,10 +203,12 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152; case GK_GFX1153: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1153; + case GK_GFX1170: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1170; case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; case GK_GFX1250: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1250; case GK_GFX1251: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1251; + case GK_GFX1310: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1310; case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC; case GK_GFX9_4_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC; case GK_GFX10_1_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC; @@ -302,9 +306,9 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( #undef PRINT_RES_INFO } -void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, - const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) { +void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums( + const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, + const 
MCSymbol *MaxNamedBarrier) { #define PRINT_RES_INFO(ARG) \ OS << "\t.set "; \ ARG->print(OS, getContext().getAsmInfo()); \ @@ -315,6 +319,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, PRINT_RES_INFO(MaxVGPR); PRINT_RES_INFO(MaxAGPR); PRINT_RES_INFO(MaxSGPR); + PRINT_RES_INFO(MaxNamedBarrier); #undef PRINT_RES_INFO } @@ -398,7 +403,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( EmitMCExpr(KD.kernarg_size); OS << '\n'; - if (isGFX1250(STI)) { + if (isGFX1250Plus(STI)) { PrintField(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_SHIFT, amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT, @@ -512,7 +517,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( OS << '\n'; } - if (AMDGPU::isGFX1250(STI)) + if (isGFX1250Plus(STI)) PrintField(KD.compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 22afcde..3a0d8dc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -69,7 +69,8 @@ public: virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) {}; + const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) {}; /// \returns True on success, false on failure. virtual bool EmitISAVersion() { return true; } @@ -149,7 +150,8 @@ public: const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall) override; void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, - const MCSymbol *MaxSGPR) override; + const MCSymbol *MaxSGPR, + const MCSymbol *MaxNamedBarrier) override; /// \returns True on success, false on failure. 
bool EmitISAVersion() override; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..b023c96 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit 
enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string 
opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string 
opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, 
data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, 
asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, 
asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit 
isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string 
TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; @@ -2057,12 +2076,12 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> : string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol"; } -let SubtargetPredicate = isGFX1250Plus in { +let SubtargetPredicate = isGFX125xOnly in { def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">; def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">; def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>; def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>; -} // End SubtargetPredicate = isGFX1250Plus. +} // End SubtargetPredicate = isGFX125xOnly. 
class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat < (node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)), @@ -2074,12 +2093,12 @@ class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat < (inst $vaddr0, $vaddr1, 0, $cpol) >; -let SubtargetPredicate = isGFX1250Plus in { +let SubtargetPredicate = isGFX125xOnly in { def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>; def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>; def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>; def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>; -} +} // End SubtargetPredicate = isGFX125xOnly. class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>, @@ -2097,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?); + // Set VADDR4 to NULL + let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + // set to 0 based on SPG. 
- let vaddr4 = 0; let rsrc = 0; let vdata = 0; let d16 = 0; @@ -2109,7 +2130,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p } multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> { - let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in { + let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in { foreach DSuffix = ["_D2", ""] in { defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix); def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>, diff --git a/llvm/lib/Target/AMDGPU/R600.td b/llvm/lib/Target/AMDGPU/R600.td index 9148edb..bdfaac9 100644 --- a/llvm/lib/Target/AMDGPU/R600.td +++ b/llvm/lib/Target/AMDGPU/R600.td @@ -8,15 +8,6 @@ include "llvm/Target/Target.td" -def R600InstrInfo : InstrInfo { - let guessInstructionProperties = 1; -} - -def R600 : Target { - let InstructionSet = R600InstrInfo; - let AllowRegisterRenaming = 1; -} - let Namespace = "R600" in { foreach Index = 0-15 in { @@ -27,6 +18,18 @@ include "R600RegisterInfo.td" } +defm : RemapAllTargetPseudoPointerOperands<R600_Addr>; + +def R600InstrInfo : InstrInfo { + let guessInstructionProperties = 1; +} + +def R600 : Target { + let InstructionSet = R600InstrInfo; + let AllowRegisterRenaming = 1; +} + + def NullALU : InstrItinClass; def ALU_NULL : FuncUnit; diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 81b142e..248d734 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -68,7 +68,7 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { getLoopDepth() > 1) return true; - if (!ST->hasCFAluBug()) + if (!ST->hasCFALUBug()) return false; switch(Opcode) { diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c9..90c09fe 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -13,6 +13,7 @@ #include "R600ISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUSelectionDAGInfo.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -29,7 +30,8 @@ using namespace llvm; R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) - : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { + : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI), + Gen(STI.getGeneration()) { addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); @@ -1129,12 +1131,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); @@ -1481,6 +1480,9 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } + if (VT.isInteger() && !MemVT.isInteger()) + MemVT = MemVT.changeTypeToInteger(); + if (AMDGPU::isShader(CallConv)) { Register Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); @@ -1497,11 +1499,15 @@ SDValue R600TargetLowering::LowerFormalArguments( // thread group and global sizes. 
ISD::LoadExtType Ext = ISD::NON_EXTLOAD; if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { - // FIXME: This should really check the extload type, but the handling of - // extload vector parameters seems to be broken. + if (VT.isFloatingPoint()) { + Ext = ISD::EXTLOAD; + } else { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. - // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - Ext = ISD::SEXTLOAD; + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } } // Compute the offset from the value. @@ -2179,18 +2185,20 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, } TargetLowering::AtomicExpansionKind -R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { +R600TargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { case AtomicRMWInst::Nand: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: case AtomicRMWInst::FMax: case AtomicRMWInst::FMin: + case AtomicRMWInst::USubCond: + case AtomicRMWInst::USubSat: return AtomicExpansionKind::CmpXChg; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: // FIXME: Cayman at least appears to have instructions for this, but the - // instruction defintions appear to be missing. + // instruction definitions appear to be missing. 
return AtomicExpansionKind::CmpXChg; case AtomicRMWInst::Xchg: { const DataLayout &DL = RMW->getFunction()->getDataLayout(); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index fc361c01..661efb8 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -116,7 +116,7 @@ private: SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; TargetLowering::AtomicExpansionKind - shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override; + shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const override; }; } // End namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cc..7f805e6 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; @@ -176,7 +176,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { } bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode()); } @@ -186,7 +186,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { } bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode())) || usesTextureCache(MI.getOpcode()); @@ -948,7 +948,7 @@ bool 
R600InstrInfo::PredicateInstruction(MachineInstr &MI, .setReg(Pred[2].getReg()); MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -956,7 +956,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, if (PIdx != -1) { MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index 68bbac1..b96c17e 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -326,7 +326,7 @@ public: namespace R600 { -int getLDSNoRetOp(uint16_t Opcode); +int64_t getLDSNoRetOp(uint32_t Opcode); } //End namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index dda0cf6..6d7cc8b 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -736,22 +736,22 @@ def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; // XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
def SETE : R600_2OP < 0x08, "SETE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OEQ))] >; def SGT : R600_2OP < 0x09, "SETGT", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OGT))] >; def SGE : R600_2OP < 0xA, "SETGE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_OGE))] >; def SNE : R600_2OP < 0xB, "SETNE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, fpimm_one, fpimm_zero, COND_UNE_NE))] >; def SETE_DX10 : R600_2OP < @@ -1004,19 +1004,19 @@ class FMA_Common <bits<5> inst> : R600_3OP < class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] + [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OEQ))] >; class CNDGT_Common <bits<5> inst> : R600_3OP < inst, "CNDGT", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] + [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OGT))] > { let Itinerary = VecALU; } class CNDGE_Common <bits<5> inst> : R600_3OP < inst, "CNDGE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] + [(set f32:$dst, (selectcc f32:$src0, fpimm_zero, f32:$src1, f32:$src2, COND_OGE))] > { let Itinerary = VecALU; } diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp index 48b4e7f..ac6508c 100644 --- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp @@ -55,7 +55,7 @@ void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { StringRef Err; if 
(!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + LLVMContext &C = MI->getMF()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index d9902e1..56d1a19 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -464,7 +464,7 @@ void R600MachineCFGStructurizer::insertCondBranchBefore( MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); MBB->insert(I, NewMI); MachineInstrBuilder MIB(*MF, NewMI); - MIB.addReg(OldMI->getOperand(1).getReg(), false); + MIB.addReg(OldMI->getOperand(1).getReg()); SHOWNEWINSTR(NewMI); //erase later oldInstr->eraseFromParent(); } @@ -476,7 +476,7 @@ void R600MachineCFGStructurizer::insertCondBranchBefore( MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); //insert before blk->insert(I, NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum); SHOWNEWINSTR(NewInstr); } @@ -1401,7 +1401,7 @@ void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingM << LandMBB->getNumber() << "\n";); MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); assert(BranchMI && isCondBranch(BranchMI)); - DebugLoc DL = BranchMI->getDebugLoc(); + const DebugLoc &DL = BranchMI->getDebugLoc(); MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); MachineBasicBlock::iterator I = BranchMI; if (TrueBranch != LandMBB) @@ -1427,7 +1427,7 @@ void R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingM MachineBasicBlock::iterator I = MI; MachineBasicBlock *TrueBranch = getTrueBranch(MI); int OldOpcode = MI->getOpcode(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = 
MI->getDebugLoc(); bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 7f75f27..9e1a97e 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -181,7 +181,7 @@ MachineInstr *R600VectorRegMerger::RebuildVector( Register Reg = RSI->Instr->getOperand(0).getReg(); MachineBasicBlock::iterator Pos = RSI->Instr; MachineBasicBlock &MBB = *Pos->getParent(); - DebugLoc DL = Pos->getDebugLoc(); + const DebugLoc &DL = Pos->getDebugLoc(); Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; @@ -222,8 +222,8 @@ MachineInstr *R600VectorRegMerger::RebuildVector( // Update RSI RSI->Instr = NewMI; - RSI->RegToChan = UpdatedRegToChan; - RSI->UndefReg = UpdatedUndef; + RSI->RegToChan = std::move(UpdatedRegToChan); + RSI->UndefReg = std::move(UpdatedUndef); return NewMI; } diff --git a/llvm/lib/Target/AMDGPU/R600Processors.td b/llvm/lib/Target/AMDGPU/R600Processors.td index 0265a97..dc21eb9 100644 --- a/llvm/lib/Target/AMDGPU/R600Processors.td +++ b/llvm/lib/Target/AMDGPU/R600Processors.td @@ -14,7 +14,7 @@ class SubtargetFeatureFetchLimit <string Value> : >; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", + "HasR600ALUInst", "false", "Older version of ALU instructions encoding" >; @@ -29,37 +29,43 @@ def FeatureVertexCache : SubtargetFeature<"HasVertexCache", >; def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", + "HasCaymanISA", "true", "Use Cayman ISA" >; def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", + "HasCFALUBug", "true", "GPU has CF_ALU bug" >; +def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", + "HasMadMacF32Insts", + "true", + "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions" +>; + class 
R600SubtargetFeatureGeneration <string Value, string FeatureName, list<SubtargetFeature> Implies> : SubtargetFeatureGeneration <Value, FeatureName, "R600Subtarget", Implies>; def FeatureR600 : R600SubtargetFeatureGeneration<"R600", "r600", - [FeatureR600ALUInst, FeatureFetchLimit8] + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureMadMacF32Insts] >; def FeatureR700 : R600SubtargetFeatureGeneration<"R700", "r700", - [FeatureFetchLimit16] + [FeatureFetchLimit16, FeatureMadMacF32Insts] >; def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", "evergreen", - [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768] + [FeatureFetchLimit16, FeatureAddressableLocalMemorySize32768, FeatureMadMacF32Insts] >; def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", "northern-islands", [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureAddressableLocalMemorySize32768] + FeatureAddressableLocalMemorySize32768, FeatureMadMacF32Insts] >; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 22e56b6..71398ce 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -27,15 +27,14 @@ namespace llvm { class R600Subtarget final : public R600GenSubtargetInfo, public AMDGPUSubtarget { + +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "R600GenSubtargetInfo.inc" + private: R600InstrInfo InstrInfo; R600FrameLowering FrameLowering; - bool FMA = false; - bool CaymanISA = false; - bool CFALUBug = false; - bool HasVertexCache = false; - bool R600ALUInst = false; - bool FP64 = false; short TexVTXClauseSize = 0; Generation Gen = R600; R600TargetLowering TLInfo; @@ -102,9 +101,7 @@ public: return (getGeneration() >= EVERGREEN); } - bool hasCaymanISA() const { - return CaymanISA; - } + bool hasCaymanISA() const { return HasCaymanISA; } bool hasFFBL() const { return (getGeneration() >= EVERGREEN); @@ -114,9 +111,15 @@ 
public: return (getGeneration() >= EVERGREEN); } - bool hasFMA() const { return FMA; } + bool hasFMA() const override { return HasFMA; } + + bool hasMadMacF32Insts() const override { return HasMadMacF32Insts; } + + bool enablePromoteAlloca() const override { return EnablePromoteAlloca; } + + bool hasFP64() const override { return HasFP64; } - bool hasCFAluBug() const { return CFALUBug; } + bool hasCFALUBug() const { return HasCFALUBug; } bool hasVertexCache() const { return HasVertexCache; } diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index c20487e..4771967 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -57,9 +57,9 @@ public: R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC); - void addPreISel(AddIRPass &addPass) const; - void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; - Error addInstSelector(AddMachinePass &) const; + void addPreISel(PassManagerWrapper &PMW) const; + void addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const; + Error addInstSelector(PassManagerWrapper &PMW) const; }; //===----------------------------------------------------------------------===// @@ -188,16 +188,16 @@ R600CodeGenPassBuilder::R600CodeGenPassBuilder( Opt.RequiresCodeGenSCCOrder = true; } -void R600CodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { +void R600CodeGenPassBuilder::addPreISel(PassManagerWrapper &PMW) const { // TODO: Add passes pre instruction selection. } -void R600CodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass, +void R600CodeGenPassBuilder::addAsmPrinter(PassManagerWrapper &PMW, CreateMCStreamer) const { // TODO: Add AsmPrinter. } -Error R600CodeGenPassBuilder::addInstSelector(AddMachinePass &) const { +Error R600CodeGenPassBuilder::addInstSelector(PassManagerWrapper &PMW) const { // TODO: Add instruction selector. 
return Error::success(); } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp index 3093227..c08edc1 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp @@ -108,19 +108,17 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode, } } -InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, - const Value *Op0, - const Value *Op1) const { +InstructionCost R600TTIImpl::getVectorInstrCost( + unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, + const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const { switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { unsigned EltSize = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); if (EltSize < 32) { - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, - Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } // Extracts are just reads of a subregister, so are free. Inserts are @@ -131,7 +129,8 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, return Index == ~0u ? 
2 : 0; } default: - return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1); + return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1, + VIC); } } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h index 3deae69..ade1b15 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h @@ -62,10 +62,11 @@ public: InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) const override; using BaseT::getVectorInstrCost; - InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, - TTI::TargetCostKind CostKind, - unsigned Index, const Value *Op0, - const Value *Op1) const override; + InstructionCost + getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, + unsigned Index, const Value *Op0, const Value *Op1, + TTI::VectorInstrContext VIC = + TTI::VectorInstrContext::None) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index ecc2824..0c7c642 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -46,6 +46,7 @@ enum { GFX11 = 10, GFX12 = 11, GFX1250 = 12, + GFX13 = 13, }; } @@ -207,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_FP16, OPERAND_REG_IMM_V2BF16, OPERAND_REG_IMM_V2FP16, + OPERAND_REG_IMM_V2FP16_SPLAT, OPERAND_REG_IMM_V2INT16, OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, @@ -423,6 +425,9 @@ enum CPol { // Volatile (used to preserve/signal operation volatility for buffer // operations not a real instruction bit) VOLATILE = 1 << 31, + // The set of "cache policy" bits used for compiler features that + // do not correspond to hardware features. + VIRTUAL_BITS = VOLATILE, }; } // namespace CPol @@ -445,7 +450,6 @@ enum Id { // Message ID, width(4) [3:0]. 
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 - ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250 ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, @@ -459,6 +463,7 @@ enum Id { // Message ID, width(4) [3:0]. ID_RTN_GET_SE_AID_ID = 135, ID_RTN_GET_CLUSTER_BARRIER_STATE = 136, // added in GFX1250 + ID_RTN_SAVE_WAVE_HAS_TDM = 152, // added in GFX1250 ID_MASK_PreGFX11_ = 0xF, ID_MASK_GFX11Plus_ = 0xFF @@ -496,6 +501,14 @@ enum StreamId : unsigned { // Stream ID, (2) [9:8]. } // namespace SendMsg +namespace WaitEvent { // Encoding of SIMM16 used in s_wait_event +enum Id { + DONT_WAIT_EXPORT_READY = 1 << 0, // Only used in gfx11 + EXPORT_READY = 1 << 1, // gfx12+ +}; + +} // namespace WaitEvent + namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. enum Id { // HwRegCode, (6) [5:0] @@ -520,6 +533,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID1 = 23, ID_HW_ID2 = 24, ID_POPS_PACKER = 25, + ID_SCHED_MODE = 26, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, @@ -578,11 +592,11 @@ enum ModeRegisterMasks : uint32_t { CSP_MASK = 0x7u << 29, // Bits 29..31 // GFX1250 - DST_VGPR_MSB = 1 << 12, - SRC0_VGPR_MSB = 1 << 13, - SRC1_VGPR_MSB = 1 << 14, - SRC2_VGPR_MSB = 1 << 15, - VGPR_MSB_MASK = 0xf << 12, // Bits 12..15 + DST_VGPR_MSB = 0x3 << 12, + SRC0_VGPR_MSB = 0x3 << 14, + SRC1_VGPR_MSB = 0x3 << 16, + SRC2_VGPR_MSB = 0x3 << 18, + VGPR_MSB_MASK = 0xff << 12, // Bits 12..19 REPLAY_MODE = 1 << 25, FLAT_SCRATCH_IS_NV = 1 << 26, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 7793907..8782fc5 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -111,7 +111,7 @@ public: V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; #if !defined(NDEBUG) || 
defined(LLVM_ENABLE_DUMP) - void dump() { + void dump() const { dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty << "\nScore: " << Score << "\n"; @@ -238,7 +238,7 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, const SIRegisterInfo *TRI, const SIInstrInfo *TII) { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); auto &Src = MI.getOperand(1); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = Src.getReg(); @@ -856,8 +856,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } } - if (TRI->isVectorRegister(*MRI, PHIRes) || - RC0 == &AMDGPU::VReg_1RegClass) { + if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) || + RC0 == &AMDGPU::VReg_1RegClass) { LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); TII->legalizeOperands(MI, MDT); } @@ -902,14 +902,28 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // really much we can do to fix this. // Some special instructions use M0 as an input. Some even only use // the first lane. Insert a readfirstlane and hope for the best. 
- if (DstReg == AMDGPU::M0 && - TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg) .add(MI.getOperand(1)); + + unsigned SubReg = MI.getOperand(1).getSubReg(); MI.getOperand(1).setReg(TmpReg); + MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister); + + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? OpRC + : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI, MI.getDebugLoc())) { I = std::next(I); @@ -930,7 +944,7 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, // s_mov_b32. 
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) { MI.getOperand(1).ChangeToImmediate(Imm); - MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + MI.addImplicitDefUseOperands(*MI.getMF()); MI.setDesc(TII->get(SMovOp)); return true; } @@ -999,7 +1013,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { AnalysisWorklist.push_back(U); } } - V2SCopies[Info.ID] = Info; + V2SCopies[Info.ID] = std::move(Info); } // The main function that computes the VGPR to SGPR copy score @@ -1058,7 +1072,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { unsigned CurID = LoweringWorklist.pop_back_val(); auto *CurInfoIt = V2SCopies.find(CurID); if (CurInfoIt != V2SCopies.end()) { - V2SCopyInfo C = CurInfoIt->second; + const V2SCopyInfo &C = CurInfoIt->second; LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); for (auto S : C.Siblings) { auto *SibInfoIt = V2SCopies.find(S); @@ -1075,10 +1089,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); + Copies.insert(C.Copy); // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if // instead. 
V2SCopies.erase(C.ID); - Copies.insert(C.Copy); } } @@ -1115,16 +1129,27 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32) - .addReg(SrcReg, 0, SubReg) + .addReg(SrcReg, {}, SubReg) .addImm(AMDGPU::lo16) .addReg(Undef) .addImm(AMDGPU::hi16); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) .addReg(VReg32); } else if (SrcSize == 32) { - auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); - MIB.addReg(SrcReg, 0, SubReg); + const MCInstrDesc &ReadFirstLaneDesc = + TII->get(AMDGPU::V_READFIRSTLANE_B32); + const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1); + BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg) + .addReg(SrcReg, {}, SubReg); + + const TargetRegisterClass *ConstrainRC = + SubReg == AMDGPU::NoSubRegister + ? 
OpRC + : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC, + SubReg); + + if (!MRI->constrainRegClass(SrcReg, ConstrainRC)) + llvm_unreachable("failed to constrain register"); } else { auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), DstReg); diff --git a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index d0d6792..b368e20 100644 --- a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -27,9 +27,7 @@ class SIFixVGPRCopiesLegacy : public MachineFunctionPass { public: static char ID; - SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) { - initializeSIFixVGPRCopiesLegacyPass(*PassRegistry::getPassRegistry()); - } + SIFixVGPRCopiesLegacy() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 6616b30..a2fe31b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -187,7 +187,7 @@ public: unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const { switch (Opc) { case AMDGPU::S_ADD_I32: { - if (ST->hasAddNoCarry()) + if (ST->hasAddNoCarryInsts()) return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32; return UseVOP3 ? 
AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; } @@ -242,7 +242,6 @@ public: SmallVectorImpl<FoldCandidate> &FoldList, SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; - std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const; bool tryConstantFoldOp(MachineInstr *MI) const; bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; @@ -681,6 +680,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned I = 0; I < MI->getNumDefs(); ++I) + if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1) + MI->getOperand(I).setIsEarlyClobber(true); } // TODO: Should we try to avoid adding this to the candidate list? @@ -709,7 +712,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); @@ -762,6 +765,29 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. +static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) + return false; + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; +} + +// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or +// literal) and replicates the bits to both channels. 
Therefore, if the hi and +// lo are not same, we can't fold it. +static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( + const FoldableDef &OpToFold) { + assert(OpToFold.isImm() && "Expected immediate operand"); + uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); + uint32_t Lo = Lo_32(ImmVal); + uint32_t Hi = Hi_32(ImmVal); + return Lo == Hi; +} + bool SIFoldOperandsImpl::tryAddToFoldList( SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, const FoldableDef &OpToFold) const { @@ -915,6 +941,13 @@ bool SIFoldOperandsImpl::tryAddToFoldList( return true; } + // Special case for PK_F32 instructions if we are trying to fold an imm to + // src0 or src1. + if (OpToFold.isImm() && + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) + return false; + appendFoldCandidate(FoldList, MI, OpNo, OpToFold); return true; } @@ -1129,40 +1162,14 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; - MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) + return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; } - // TODO: Verify the following code handles subregisters correctly. - // TODO: Handle extract of global reference - if (UseOp.getSubReg()) - return false; - - if (!OpToFold.isReg()) - return false; - - Register UseReg = OpToFold.getReg(); - if (!UseReg.isVirtual()) - return false; - - // Maybe it is just a COPY of an immediate itself. - - // FIXME: Remove this handling. There is already special case folding of - // immediate into copy in foldOperand. 
This is looking for the def of the - // value the folding started from in the first place. - MachineInstr *Def = MRI->getVRegDef(UseReg); - if (Def && TII->isFoldableCopy(*Def)) { - MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { - FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC, - OpToFold.DefSubReg); - appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm); - return true; - } - } - return false; } @@ -1309,10 +1316,11 @@ void SIFoldOperandsImpl::foldOperand( continue; const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1; - const TargetRegisterClass *MovSrcRC = - TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx])); - if (MovSrcRC) { + int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]); + if (RegClassID != -1) { + const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID); + if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); @@ -1351,7 +1359,7 @@ void SIFoldOperandsImpl::foldOperand( if (MovOp == AMDGPU::V_MOV_B16_t16_e64) { const auto &SrcOp = UseMI->getOperand(UseOpIdx); MachineOperand NewSrcOp(SrcOp); - MachineFunction *MF = UseMI->getParent()->getParent(); + MachineFunction *MF = UseMI->getMF(); UseMI->removeOperand(1); UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers UseMI->addOperand(NewSrcOp); // src0 @@ -1382,7 +1390,7 @@ void SIFoldOperandsImpl::foldOperand( // Remove this if 16-bit SGPRs (i.e. 
SGPR_LO16) are added to the // VS_16RegClass // - // Excerpt from AMDGPUGenRegisterInfo.inc + // Excerpt from AMDGPUGenRegisterInfoEnums.inc // NoSubRegister, //0 // hi16, // 1 // lo16, // 2 @@ -1437,6 +1445,7 @@ void SIFoldOperandsImpl::foldOperand( return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + UseMI->clearFlag(MachineInstr::NoConvergent); if (OpToFold.isImm()) { UseMI->getOperand(1).ChangeToImmediate( @@ -1468,6 +1477,7 @@ void SIFoldOperandsImpl::foldOperand( UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); UseMI->removeOperand(2); // Remove exec read (or src1 for readlane) + UseMI->clearFlag(MachineInstr::NoConvergent); return; } } @@ -1558,38 +1568,6 @@ static unsigned getMovOpc(bool IsScalar) { return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } -static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { - MI.setDesc(NewDesc); - - // Remove any leftover implicit operands from mutating the instruction. e.g. - // if we replace an s_and_b32 with a copy, we don't need the implicit scc def - // anymore. - const MCInstrDesc &Desc = MI.getDesc(); - unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + - Desc.implicit_defs().size(); - - for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) - MI.removeOperand(I); -} - -std::optional<int64_t> -SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const { - if (Op.isImm()) - return Op.getImm(); - - if (!Op.isReg() || !Op.getReg().isVirtual()) - return std::nullopt; - - const MachineInstr *Def = MRI->getVRegDef(Op.getReg()); - if (Def && Def->isMoveImmediate()) { - const MachineOperand &ImmSrc = Def->getOperand(1); - if (ImmSrc.isImm()) - return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg()); - } - - return std::nullopt; -} - // Try to simplify operations with a constant that may appear after instruction // selection. // TODO: See if a frame index with a fixed offset can fold. 
@@ -1604,13 +1582,14 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { return false; MachineOperand *Src0 = &MI->getOperand(Src0Idx); - std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0); + std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0); if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) && Src0Imm) { MI->getOperand(1).ChangeToImmediate(~*Src0Imm); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1619,7 +1598,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { return false; MachineOperand *Src1 = &MI->getOperand(Src1Idx); - std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1); + std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1); if (!Src0Imm && !Src1Imm) return false; @@ -1638,7 +1617,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { // instruction. 
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); + TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR))); return true; } @@ -1658,11 +1637,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else if (Src1Val == -1) { // y = or x, -1 => y = v_mov_b32 -1 MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); } else return false; @@ -1674,11 +1654,12 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 MI->removeOperand(Src0Idx); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); + TII->mutateAndCleanupImplicit( + *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); } else if (Src1Val == -1) { // y = and x, -1 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); } else return false; @@ -1690,7 +1671,7 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->removeOperand(Src1Idx); - mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY)); return true; } } @@ -1708,11 +1689,11 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (!Src1->isIdenticalTo(*Src0)) { - std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1); + std::optional<int64_t> Src1Imm = 
TII->getImmOrMaterializedImm(*Src1); if (!Src1Imm) return false; - std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0); + std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0); if (!Src0Imm || *Src0Imm != *Src1Imm) return false; } @@ -1736,7 +1717,7 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MI.removeOperand(Src1ModIdx); if (Src0ModIdx != -1) MI.removeOperand(Src0ModIdx); - mutateCopyOp(MI, NewDesc); + TII->mutateAndCleanupImplicit(MI, NewDesc); LLVM_DEBUG(dbgs() << MI); return true; } @@ -1746,7 +1727,8 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const { MI.getOpcode() != AMDGPU::V_AND_B32_e32) return false; - std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1)); + std::optional<int64_t> Src0Imm = + TII->getImmOrMaterializedImm(MI.getOperand(1)); if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg()) return false; @@ -1804,7 +1786,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, if (CopiesToReplace.empty() && FoldList.empty()) return Changed; - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); // Make sure we add EXEC uses to any new v_mov instructions created. 
for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); @@ -2419,7 +2401,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; @@ -2435,7 +2417,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { } else { // This is a copy MachineInstr *SubDef = MRI->getVRegDef(Def->getReg()); SubDef->getOperand(1).setIsKill(false); - RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg()); + RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg()); } RS.addImm(SubIdx); } @@ -2759,7 +2741,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) { MachineInstr *VGPRCopy = BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(), TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR) - .addReg(Reg, /* flags */ 0, SubReg); + .addReg(Reg, /* flags */ {}, SubReg); // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs. Register TempAGPR = MRI->createVirtualRegister(ARC); @@ -2793,7 +2775,6 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) { // // FIXME: Also need to check strictfp bool IsIEEEMode = MFI->getMode().IEEE; - bool HasNSZ = MFI->hasNoSignedZerosFPMath(); bool Changed = false; for (MachineBasicBlock *MBB : depth_first(&MF)) { @@ -2832,8 +2813,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) { // TODO: Omod might be OK if there is NSZ only on the source // instruction, and not the omod multiply. 
- if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || - !tryFoldOMod(MI)) + if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI)) Changed |= tryFoldClamp(MI); } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 6b13b06..9820341 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -33,7 +33,7 @@ MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), namespace { class SIFormMemoryClausesImpl { - using RegUse = DenseMap<unsigned, std::pair<unsigned, LaneBitmask>>; + using RegUse = DenseMap<unsigned, std::pair<RegState, LaneBitmask>>; bool canBundle(const MachineInstr &MI, const RegUse &Defs, const RegUse &Uses) const; @@ -61,9 +61,7 @@ class SIFormMemoryClausesLegacy : public MachineFunctionPass { public: static char ID; - SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) { - initializeSIFormMemoryClausesLegacyPass(*PassRegistry::getPassRegistry()); - } + SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -132,8 +130,8 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return true; } -static unsigned getMopState(const MachineOperand &MO) { - unsigned S = 0; +static RegState getMopState(const MachineOperand &MO) { + RegState S = {}; if (MO.isImplicit()) S |= RegState::Implicit; if (MO.isDead()) @@ -234,7 +232,7 @@ void SIFormMemoryClausesImpl::collectRegUses(const MachineInstr &MI, : LaneBitmask::getAll(); RegUse &Map = MO.isDef() ? Defs : Uses; - unsigned State = getMopState(MO); + RegState State = getMopState(MO); auto [Loc, Inserted] = Map.try_emplace(Reg, State, Mask); if (!Inserted) { Loc->second.first |= State; @@ -349,7 +347,7 @@ bool SIFormMemoryClausesImpl::run(MachineFunction &MF) { continue; // Collect the register operands we should extend the live ranges of. 
- SmallVector<std::tuple<unsigned, unsigned>> KillOps; + SmallVector<std::tuple<RegState, unsigned>> KillOps; const LiveInterval &LI = LIS->getInterval(R.first); if (!LI.hasSubRanges()) { diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0189e7b..a0952b3 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -139,8 +139,8 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff = 0) { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -163,8 +163,8 @@ static void buildEpilogRestore(const GCNSubtarget &ST, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff = 0) { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -591,7 +591,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( } static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { - return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize(); + return ST.hasFlatScratchEnabled() ? 
1 : ST.getWavefrontSize(); } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, @@ -629,7 +629,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // This will return `Register()` in cases where there are no actual // uses of the SRSRC. Register ScratchRsrcReg; - if (!ST.enableFlatScratch()) + if (!ST.hasFlatScratchEnabled()) ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); // Make the selected register live throughout the function. @@ -724,14 +724,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, FrameInfo.getMaxAlign()); MFI->setScratchReservedForDynamicVGPRs(VGPRSize); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg) - .addImm(AMDGPU::Hwreg::HwregEncoding::encode( - AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2)); - // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute - // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set - // SCC, so we need to check for 0 manually. 
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize); + BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg); if (requiresStackPointerReference(MF)) { Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); @@ -755,10 +748,10 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, bool NeedsFlatScratchInit = MFI->getUserSGPRInfo().hasFlatScratchInit() && (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || - (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); + (!allStackObjectsAreDead(FrameInfo) && ST.hasFlatScratchEnabled())); if ((NeedsFlatScratchInit || ScratchRsrcReg) && - PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) { + PreloadedScratchWaveOffsetReg && !ST.hasArchitectedFlatScratch()) { MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } @@ -772,6 +765,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, PreloadedScratchRsrcReg, ScratchRsrcReg, ScratchWaveOffsetReg); } + + if (ST.hasWaitXcnt()) { + // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK + // replay. This aligns hardware behavior with the compiler's s_wait_xcnt + // insertion logic, which assumes multi-group mode by default. + unsigned RegEncoding = + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(1) + .addImm(RegEncoding); + } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` @@ -1034,16 +1038,13 @@ void SIFrameLowering::emitCSRSpillStores( StoreWWMRegisters(WWMCalleeSavedRegs); if (FuncInfo->isWholeWaveFunction()) { - // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove - // it now. 
If we have already saved some WWM CSR registers, then the EXEC is - // already -1 and we don't need to do anything else. Otherwise, set EXEC to - // -1 here. + // If we have already saved some WWM CSR registers, then the EXEC is already + // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here. if (!ScratchExecCopy) buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ true); else if (WWMCalleeSavedRegs.empty()) EnableAllLanes(); - TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg) @@ -1340,6 +1341,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, "Needed to save BP but didn't save it anywhere"); assert((HasBP || !BPSaved) && "Saved BP but didn't need it"); + + if (FuncInfo->isWholeWaveFunction()) { + // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose. + TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -1831,9 +1837,7 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, const GCNSubtarget &ST, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) { + std::vector<CalleeSavedInfo> &CSI) { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1902,10 +1906,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, int FrameIdx = MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass), /*isSpillSlot=*/true); - if ((unsigned)FrameIdx < MinCSFrameIndex) - MinCSFrameIndex = FrameIdx; - if ((unsigned)FrameIdx > MaxCSFrameIndex) - MaxCSFrameIndex = FrameIdx; + MFI.setIsCalleeSavedObjectIndex(FrameIdx, true); CSIt->setFrameIdx(FrameIdx); 
CSIt->setReg(RegBlock); @@ -1915,8 +1916,7 @@ static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, bool SIFrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const { + std::vector<CalleeSavedInfo> &CSI) const { if (CSI.empty()) return true; // Early exit if no callee saved registers are modified! @@ -1924,12 +1924,12 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots( bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR(); if (UseVGPRBlocks) - assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex); + assignSlotsUsingVGPRBlocks(MF, ST, CSI); - return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks; + return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks; } -bool SIFrameLowering::assignCalleeSavedSpillSlots( +bool SIFrameLowering::assignCalleeSavedSpillSlotsImpl( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { if (CSI.empty()) @@ -1986,7 +1986,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( // TODO: We could try sorting the objects to find a hole in the first bytes // rather than allocating as close to possible. This could save a lot of space // on frames with alignment requirements. 
- if (ST.enableFlatScratch()) { + if (ST.hasFlatScratchEnabled()) { if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) return false; @@ -2168,7 +2168,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index a727729..4c1cf3c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,11 +49,9 @@ public: const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const override; - bool assignCalleeSavedSpillSlots(MachineFunction &MF, - const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex) const override; + bool assignCalleeSavedSpillSlotsImpl(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a2841c11..fe1d24f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPULaneMaskUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -34,6 +35,8 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include 
"llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/PseudoSourceValueManager.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" @@ -86,69 +89,78 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) { SITargetLowering::SITargetLowering(const TargetMachine &TM, const GCNSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) { + : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V32RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(32); + addRegisterClass(MVT::f32, V32RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - const SIRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + const TargetRegisterClass *V64RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(64); addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160)); addRegisterClass(MVT::v6i32, 
&AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288)); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); + addRegisterClass(MVT::v10f32, + TRI->getDefaultVectorSuperClassForBitWidth(320)); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); + addRegisterClass(MVT::v11f32, + TRI->getDefaultVectorSuperClassForBitWidth(352)); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); + addRegisterClass(MVT::v12f32, + TRI->getDefaultVectorSuperClassForBitWidth(384)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, + TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v8i64, 
&AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +192,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -217,9 +230,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, ISD::SETCC}) { - // FIXME: The promoted to type shouldn't need to be explicit setOperationAction(Opc, MVT::bf16, Promote); - AddPromotedToType(Opc, MVT::bf16, MVT::f32); } setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); @@ -263,6 +274,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); + setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -298,7 +310,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); - setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); + setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); @@ -492,6 +504,9 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); + setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal); } else { setOperationAction(ISD::FSQRT, MVT::f16, Custom); } @@ -499,21 +514,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMadMacF32Insts()) setOperationAction(ISD::FMAD, MVT::f32, Legal); - if (!Subtarget->hasBFI()) - // fcopysign can be done in a single instruction with BFI. - setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - - if (Subtarget->hasFFBH()) - setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); - - if (Subtarget->hasFFBL()) - setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); // We only really have 32-bit BFE instructions (and 16-bit on VI). // @@ -523,14 +525,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that // span the midpoint are probably relatively rare, so don't worry about them // for now. 
- if (Subtarget->hasBFE()) - setHasExtractBitsInsn(true); + setHasExtractBitsInsn(true); // Clamp modifier on add/sub if (Subtarget->hasIntClamp()) setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal); - if (Subtarget->hasAddNoCarry()) + if (Subtarget->hasAddNoCarryInsts()) setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, Legal); @@ -562,6 +563,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand); setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); + setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i32, + Custom); + setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i16, + Custom); + setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i1, + Custom); + // Custom lower these because we can't specify a rule based on an illegal // source bf16. setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); @@ -623,8 +631,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasBF16TransInsts()) setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom); - setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); - setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT}, + MVT::f16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT}, + MVT::bf16, Promote); // F16 - VOP2 Actions. 
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, @@ -657,6 +669,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, break; case ISD::EXTRACT_SUBVECTOR: case ISD::CONCAT_VECTORS: + case ISD::FSIN: + case ISD::FCOS: setOperationAction(Op, VT, Custom); break; default: @@ -1016,6 +1030,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::SCALAR_TO_VECTOR, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND_INREG, + ISD::ANY_EXTEND, ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT, ISD::FCOPYSIGN}); @@ -1047,6 +1062,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::ATOMIC_LOAD_FMAX, ISD::ATOMIC_LOAD_UINC_WRAP, ISD::ATOMIC_LOAD_UDEC_WRAP, + ISD::ATOMIC_LOAD_USUB_COND, + ISD::ATOMIC_LOAD_USUB_SAT, ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN}); @@ -1109,12 +1126,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 16) { - if (Subtarget->has16BitInsts()) { - if (VT.isInteger()) - return MVT::v2i16; - return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16); - } - return VT.isInteger() ? MVT::i32 : MVT::f32; + return Subtarget->has16BitInsts() + ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2) + : MVT::i32; } if (Size < 16) @@ -1122,6 +1136,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; } + if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16) + return MVT::i32; + if (VT.getSizeInBits() > 32) return MVT::i32; @@ -1140,7 +1157,7 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, unsigned Size = ScalarVT.getSizeInBits(); // FIXME: Should probably promote 8-bit vectors to i16. 
- if (Size == 16 && Subtarget->has16BitInsts()) + if (Size == 16) return (NumElts + 1) / 2; if (Size <= 32) @@ -1164,16 +1181,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( // FIXME: We should fix the ABI to be the same on targets without 16-bit // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts()) { - if (ScalarVT == MVT::bf16) { - RegisterVT = MVT::i32; - IntermediateVT = MVT::v2bf16; - } else { - RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; - IntermediateVT = RegisterVT; - } + if (Size == 16) { + MVT SimpleIntermediateVT = + MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2)); + IntermediateVT = SimpleIntermediateVT; + RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32; NumIntermediates = (NumElts + 1) / 2; - return NumIntermediates; + return (NumElts + 1) / 2; } if (Size == 32) { @@ -1279,57 +1293,61 @@ static unsigned getIntrMemWidth(unsigned IntrID) { case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b32: return 32; case Intrinsic::amdgcn_global_load_async_to_lds_b64: case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_global_store_async_from_lds_b64: case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b64: return 64; case Intrinsic::amdgcn_global_load_async_to_lds_b128: case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: case Intrinsic::amdgcn_global_store_async_from_lds_b128: case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: + case 
Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b128: return 128; default: llvm_unreachable("Unknown width"); } } -static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, - TargetLoweringBase::IntrinsicInfo &Info) { - Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2); +static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, + unsigned ArgIdx) { + Value *OrderingArg = CI.getArgOperand(ArgIdx); unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue(); switch (AtomicOrderingCABI(Ord)) { case AtomicOrderingCABI::acquire: - Info.order = AtomicOrdering::Acquire; + return AtomicOrdering::Acquire; break; case AtomicOrderingCABI::release: - Info.order = AtomicOrdering::Release; + return AtomicOrdering::Release; break; case AtomicOrderingCABI::seq_cst: - Info.order = AtomicOrdering::SequentiallyConsistent; + return AtomicOrdering::SequentiallyConsistent; break; default: - Info.order = AtomicOrdering::Monotonic; - break; + return AtomicOrdering::Monotonic; } +} - Info.flags = - (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore); - Info.flags |= MOCooperative; - +static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) { MDNode *ScopeMD = cast<MDNode>( - cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 
2 : 3))->getMetadata()); + cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata()); StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString(); - Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope); + return CI.getContext().getOrInsertSyncScopeID(Scope); } -bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &CI, +void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos, + const CallBase &CI, MachineFunction &MF, unsigned IntrID) const { + IntrinsicInfo Info; Info.flags = MachineMemOperand::MONone; if (CI.hasMetadata(LLVMContext::MD_invariant_load)) Info.flags |= MachineMemOperand::MOInvariant; @@ -1343,7 +1361,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID); MemoryEffects ME = Attr.getMemoryEffects(); if (ME.doesNotAccessMemory()) - return false; + return; // TODO: Should images get their own address space? Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; @@ -1433,13 +1451,35 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } break; case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_buffer_load_async_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + + // Entry 0: Load from buffer. + // Don't set an offset, since the pointer value always represents the + // base of the buffer. 
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); - Info.ptrVal = CI.getArgOperand(1); - return true; + Info.flags &= ~MachineMemOperand::MOStore; + Infos.push_back(Info); + + // Entry 1: Store to LDS. + // Instruction offset is applied, and an additional per-lane offset + // which we simulate using a larger memory type. + Info.memVT = EVT::getIntegerVT( + CI.getContext(), Width * 8 * Subtarget->getWavefrontSize()); + Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2)) + ->getZExtValue(); + Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS; + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_raw_atomic_buffer_load: case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: @@ -1449,11 +1489,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), std::numeric_limits<unsigned>::max()); Info.flags &= ~MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } } } - return true; + Infos.push_back(Info); + return; } switch (IntrID) { @@ -1469,7 +1511,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!Vol->isZero()) Info.flags |= MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_add_gs_reg_rtn: case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { @@ -1478,7 +1521,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = nullptr; Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { @@ -1492,7 +1536,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, if (!Vol->isZero()) Info.flags |= 
MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64: case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: { @@ -1505,16 +1550,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.size = 8; Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; - } - case Intrinsic::amdgcn_global_atomic_csub: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh_intersect_ray: @@ -1530,14 +1567,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: { + case Intrinsic::amdgcn_flat_atomic_fmax_num: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1545,14 +1582,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile; - return true; + Infos.push_back(Info); + return; } - case Intrinsic::amdgcn_flat_load_monitor_b32: - case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_flat_load_monitor_b128: - case 
Intrinsic::amdgcn_global_load_monitor_b32: - case Intrinsic::amdgcn_global_load_monitor_b64: - case Intrinsic::amdgcn_global_load_monitor_b128: case Intrinsic::amdgcn_cluster_load_b32: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b128: @@ -1573,7 +1605,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = CI.getOperand(0); Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad; - return true; + Infos.push_back(Info); + return; + } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad; + Info.order = parseAtomicOrderingCABIArg(CI, 1); + Info.ssid = parseSyncscopeMDArg(CI, 2); + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_cooperative_atomic_load_32x4B: case Intrinsic::amdgcn_cooperative_atomic_load_16x8B: @@ -1582,8 +1631,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info); - return true; + Info.flags = (MachineMemOperand::MOLoad | MOCooperative); + Info.order = parseAtomicOrderingCABIArg(CI, 1); + Info.ssid = parseSyncscopeMDArg(CI, 2); + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_cooperative_atomic_store_32x4B: case Intrinsic::amdgcn_cooperative_atomic_store_16x8B: @@ -1592,8 +1644,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); 
Info.ptrVal = CI.getArgOperand(0); Info.align.reset(); - getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info); - return true; + Info.flags = (MachineMemOperand::MOStore | MOCooperative); + Info.order = parseAtomicOrderingCABIArg(CI, 2); + Info.ssid = parseSyncscopeMDArg(CI, 3); + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: @@ -1618,7 +1673,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad; else Info.flags |= MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_global_load_async_to_lds_b8: case Intrinsic::amdgcn_global_load_async_to_lds_b32: @@ -1628,30 +1684,68 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_cluster_load_async_to_lds_b32: case Intrinsic::amdgcn_cluster_load_async_to_lds_b64: case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: { + // Entry 0: Load from source (global/flat). Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); - Info.ptrVal = CI.getArgOperand(1); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Info.ptrVal = CI.getArgOperand(0); // Global pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); + Info.flags |= MachineMemOperand::MOLoad; + Infos.push_back(Info); + + // Entry 1: Store to LDS (same offset). + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Info.ptrVal = CI.getArgOperand(1); // LDS pointer + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_global_store_async_from_lds_b8: case Intrinsic::amdgcn_global_store_async_from_lds_b32: case Intrinsic::amdgcn_global_store_async_from_lds_b64: case Intrinsic::amdgcn_global_store_async_from_lds_b128: { + // Entry 0: Load from LDS. 
Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID)); - Info.ptrVal = CI.getArgOperand(0); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Info.ptrVal = CI.getArgOperand(1); // LDS pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue(); + Info.flags |= MachineMemOperand::MOLoad; + Infos.push_back(Info); + + // Entry 1: Store to global (same offset). + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Info.ptrVal = CI.getArgOperand(0); // Global pointer + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_load_to_lds: - case Intrinsic::amdgcn_global_load_lds: { - Info.opc = ISD::INTRINSIC_VOID; + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: { unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1)); + bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE; + + // Entry 0: Load from source (global/flat). + Info.opc = ISD::INTRINSIC_VOID; Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); - Info.ptrVal = CI.getArgOperand(1); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Info.ptrVal = CI.getArgOperand(0); // Source pointer + Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue(); + Info.flags |= MachineMemOperand::MOLoad; + if (IsVolatile) + Info.flags |= MachineMemOperand::MOVolatile; + Infos.push_back(Info); + + // Entry 1: Store to LDS. + // Same offset from the instruction, but an additional per-lane offset is + // added. Represent that using a wider memory type. 
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), + Width * 8 * Subtarget->getWavefrontSize()); + Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer + Info.flags &= ~MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOStore; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_ds_bvh_stack_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: @@ -1671,7 +1765,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(4); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - return true; + Infos.push_back(Info); + return; } case Intrinsic::amdgcn_s_prefetch_data: case Intrinsic::amdgcn_flat_prefetch: @@ -1680,10 +1775,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); Info.ptrVal = CI.getArgOperand(0); Info.flags |= MachineMemOperand::MOLoad; - return true; + Infos.push_back(Info); + return; } default: - return false; + return; } } @@ -1709,7 +1805,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Type *&AccessTy) const { Value *Ptr = nullptr; switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b32: @@ -1729,16 +1824,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_load_monitor_b128: - case Intrinsic::amdgcn_flat_load_monitor_b32: - case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_global_load_monitor_b128: - case 
Intrinsic::amdgcn_global_load_monitor_b32: - case Intrinsic::amdgcn_global_load_monitor_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_global_load_tr4_b64: @@ -1750,7 +1838,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: + case Intrinsic::amdgcn_load_async_to_lds: case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: case Intrinsic::amdgcn_global_load_async_to_lds_b8: case Intrinsic::amdgcn_global_load_async_to_lds_b32: case Intrinsic::amdgcn_global_load_async_to_lds_b64: @@ -1917,7 +2007,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } if (AS == AMDGPUAS::PRIVATE_ADDRESS) - return Subtarget->enableFlatScratch() + return Subtarget->hasFlatScratchEnabled() ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS) : isLegalMUBUFAddressingMode(AM); @@ -1980,7 +2070,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Align RequiredAlignment( PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment. - if (Subtarget->hasLDSMisalignedBug() && Size > 32 && + if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 && Alignment < RequiredAlignment) return false; @@ -2229,7 +2319,8 @@ bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const { // TODO: This should be more aggressive, particular for 16-bit element // vectors. However there are some mixed improvements and regressions. EVT EltTy = VT.getVectorElementType(); - return EltTy.getSizeInBits() % 32 == 0; + unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 
16 : 32; + return EltTy.getSizeInBits() % MinAlign == 0; } bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { @@ -2251,6 +2342,14 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { return TargetLowering::isTypeDesirableForOp(Op, VT); } +MachinePointerInfo +SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const { + // This isn't really a constant pool but close enough. + MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool()); + PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS; + return PtrInfo; +} + SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, @@ -2313,9 +2412,16 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); } - if (MemVT.isFloatingPoint()) - Val = getFPExtOrFPRound(DAG, Val, SL, VT); - else if (Signed) + if (MemVT.isFloatingPoint()) { + if (VT.isFloatingPoint()) { + Val = getFPExtOrFPRound(DAG, Val, SL, VT); + } else { + assert(!MemVT.isVector()); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + SDValue Cast = DAG.getBitcast(IntVT, Val); + Val = DAG.getAnyExtOrTrunc(Cast, SL, VT); + } + } else if (Signed) Val = DAG.getSExtOrTrunc(Val, SL, VT); else Val = DAG.getZExtOrTrunc(Val, SL, VT); @@ -2327,7 +2433,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, Align Alignment, bool Signed, const ISD::InputArg *Arg) const { - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + + MachinePointerInfo PtrInfo = + getKernargSegmentPtrInfo(DAG.getMachineFunction()); // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. 
The load should hopefully be merged with @@ -2342,7 +2450,8 @@ SDValue SITargetLowering::lowerKernargMemParameter( // TODO: If we passed in the base kernel offset we could have a better // alignment than 4, but we don't really need it. SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); - SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, + PtrInfo.getWithOffset(AlignDownOffset), Align(4), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2357,9 +2466,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + SDValue Load = DAG.getLoad( + MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment, + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); return DAG.getMergeValues({Val, Load.getValue(1)}, SL); @@ -3023,7 +3132,7 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, CallingConv::ID CallConv, bool IsShader) const { bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); - if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { + if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) { // Note: user SGPRs are handled by the front-end for graphics shaders // Pad up the used user SGPRs with dead inputs. 
@@ -3092,7 +3201,7 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } - assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || + assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader || Info.getNumPreloadedSGPRs() >= 16); } @@ -3120,7 +3229,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - if (!ST.enableFlatScratch()) { + if (!ST.hasFlatScratchEnabled()) { if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -3263,7 +3372,7 @@ SDValue SITargetLowering::LowerFormalArguments( !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); (void)UserSGPRInfo; - if (!Subtarget->enableFlatScratch()) + if (!Subtarget->hasFlatScratchEnabled()) assert(!UserSGPRInfo.hasFlatScratchInit()); if ((CallConv != CallingConv::AMDGPU_CS && CallConv != CallingConv::AMDGPU_Gfx && @@ -3334,7 +3443,7 @@ SDValue SITargetLowering::LowerFormalArguments( allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); // FIXME: Sink this into allocateSpecialInputSGPRs - if (!Subtarget->enableFlatScratch()) + if (!Subtarget->hasFlatScratchEnabled()) CCInfo.AllocateReg(Info->getScratchRSrcReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); @@ -3559,11 +3668,17 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); - // DAG.getPass() returns nullptr when using new pass manager. - // TODO: Use DAG.getMFAM() to access analysis result. 
if (DAG.getPass()) { - auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>(); + ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo()); + } else if (auto *MFAM = DAG.getMFAM()) { + Module &M = *MF.getFunction().getParent(); + auto *ArgUsageInfo = + MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF) + .getCachedResult<AMDGPUArgumentUsageAnalysis>(M); + if (ArgUsageInfo) + ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo()); } unsigned StackArgSize = CCInfo.getStackSize(); @@ -3778,12 +3893,19 @@ void SITargetLowering::passSpecialInputs( const AMDGPUFunctionArgInfo *CalleeArgInfo = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { - // DAG.getPass() returns nullptr when using new pass manager. - // TODO: Use DAG.getMFAM() to access analysis result. 
if (DAG.getPass()) { auto &ArgUsageInfo = - DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>(); + CalleeArgInfo = + &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc); + } else if (auto *MFAM = DAG.getMFAM()) { + Module &M = *DAG.getMachineFunction().getFunction().getParent(); + auto *ArgUsageInfo = + MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>( + DAG.getMachineFunction()) + .getCachedResult<AMDGPUArgumentUsageAnalysis>(M); + if (ArgUsageInfo) + CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc); } } @@ -4049,7 +4171,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; - const Function *ParentFn = CI->getParent()->getParent(); + const Function *ParentFn = CI->getFunction(); if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) return false; return true; @@ -4233,7 +4355,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); if (!IsSibCall || IsChainCallConv) { - if (!Subtarget->enableFlatScratch()) { + if (!Subtarget->hasFlatScratchEnabled()) { SmallVector<SDValue, 4> CopyFromChains; // In the HSA case, this should be an identity copy. @@ -5058,7 +5180,7 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, // Compare the just read M0 value to all possible Idx values. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) .addReg(CurrentIdxReg) - .addReg(Idx.getReg(), 0, Idx.getSubReg()); + .addReg(Idx.getReg(), {}, Idx.getSubReg()); // Update EXEC, save the original EXEC value to VCC. 
BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec) @@ -5259,7 +5381,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, setM0ToIndexFromSGPR(TII, MRI, MI, Offset); BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, 0, SubReg) + .addReg(SrcReg, {}, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -5293,7 +5415,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, .addImm(SubReg); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, 0, SubReg) + .addReg(SrcReg, {}, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -5466,6 +5588,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint32_t>::min(); case AMDGPU::S_MAX_I32: return std::numeric_limits<int32_t>::min(); + case AMDGPU::V_ADD_F32_e64: // -0.0 + return 0x80000000; + case AMDGPU::V_SUB_F32_e64: // +0.0 + return 0x0; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: case AMDGPU::S_OR_B32: @@ -5473,6 +5599,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint32_t>::min(); case AMDGPU::S_AND_B32: return std::numeric_limits<uint32_t>::max(); + case AMDGPU::V_MIN_F32_e64: + case AMDGPU::V_MAX_F32_e64: + return 0x7fc00000; // qNAN default: llvm_unreachable( "Unexpected opcode in getIdentityValueFor32BitWaveReduction"); @@ -5489,6 +5618,11 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) { return std::numeric_limits<uint64_t>::min(); case AMDGPU::V_CMP_GT_I64_e64: // max.i64 return std::numeric_limits<int64_t>::min(); + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: + return 0x7FF8000000000000; // qNAN case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: case AMDGPU::S_OR_B64: @@ -5496,6 +5630,9 @@ static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) { return 
std::numeric_limits<uint64_t>::min(); case AMDGPU::S_AND_B64: return std::numeric_limits<uint64_t>::max(); + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: + return 0x8000000000000000; // -0.0 default: llvm_unreachable( "Unexpected opcode in getIdentityValueFor64BitWaveReduction"); @@ -5507,7 +5644,17 @@ static bool is32bitWaveReduceOperation(unsigned Opc) { Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 || Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 || - Opc == AMDGPU::S_XOR_B32; + Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 || + Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 || + Opc == AMDGPU::V_SUB_F32_e64; +} + +static bool isFloatingPointWaveReduceOperation(unsigned Opc) { + return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 || + Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 || + Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 || + Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 || + Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64; } static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, @@ -5528,8 +5675,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, switch (Opc) { case AMDGPU::S_MIN_U32: case AMDGPU::S_MIN_I32: + case AMDGPU::V_MIN_F32_e64: case AMDGPU::S_MAX_U32: case AMDGPU::S_MAX_I32: + case AMDGPU::V_MAX_F32_e64: case AMDGPU::S_AND_B32: case AMDGPU::S_OR_B32: { // Idempotent operations. @@ -5541,6 +5690,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::V_CMP_LT_I64_e64: // min case AMDGPU::V_CMP_GT_U64_e64: // umax case AMDGPU::V_CMP_GT_I64_e64: // max + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: case AMDGPU::S_AND_B64: case AMDGPU::S_OR_B64: { // Idempotent operations. 
@@ -5552,8 +5705,12 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::S_XOR_B64: case AMDGPU::S_ADD_I32: case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U64_PSEUDO: { + case AMDGPU::S_SUB_U64_PSEUDO: + case AMDGPU::V_SUB_F32_e64: { const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); @@ -5708,6 +5865,72 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addImm(AMDGPU::sub1); break; } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: + case AMDGPU::V_SUB_F32_e64: { + bool is32BitOpc = is32bitWaveReduceOperation(Opc); + const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0); + Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC); + Register DstVreg = MRI.createVirtualRegister(VregRC); + // Get number of active lanes as a float val. + BuildMI(BB, MI, DL, + TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64 + : AMDGPU::V_CVT_F64_I32_e64), + ActiveLanesVreg) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(0) // clamp + .addImm(0); // output-modifier + + // Take negation of input for SUB reduction + unsigned srcMod = + (Opc == AMDGPU::V_SUB_F32_e64 || + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) + ? SISrcMods::NEG + : SISrcMods::NONE; + unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64 + : ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_MUL_F64_pseudo_e64 + : AMDGPU::V_MUL_F64_e64; + auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc), + DstVreg) + .addImm(srcMod) // src0 modifier + .addReg(SrcReg) + .addImm(SISrcMods::NONE) // src1 modifier + .addReg(ActiveLanesVreg) + .addImm(SISrcMods::NONE) // clamp + .addImm(SISrcMods::NONE); // output-mod + if (is32BitOpc) { + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } else { + Register LaneValueLoReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValueHiReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const TargetRegisterClass *VregSubRC = + TRI->getSubRegisterClass(VregRC, AMDGPU::sub0); + MachineOperand Op1L = + TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0), + VregRC, AMDGPU::sub0, VregSubRC); + MachineOperand Op1H = + TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0), + VregRC, AMDGPU::sub1, VregSubRC); + // lane value input should be in an sgpr + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValueLoReg) + .add(Op1L); + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValueHiReg) + .add(Op1H); + NewAccumulator = + BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(LaneValueLoReg) + .addImm(AMDGPU::sub0) + .addReg(LaneValueHiReg) + .addImm(AMDGPU::sub1); + } + } } RetBB = &BB; } @@ -5725,6 +5948,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock::iterator I = BB.end(); Register SrcReg = MI.getOperand(1).getReg(); bool is32BitOpc = is32bitWaveReduceOperation(Opc); + bool isFPOp = isFloatingPointWaveReduceOperation(Opc); // Create Control flow for loop // Split MI's Machine Basic block into For loop @@ -5753,7 +5977,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg) .addImm(IdentityValue); } else { - uint64_t IdentityValue = 
getIdentityValueFor64BitWaveReduction(Opc); + uint64_t IdentityValue = + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 + ? 0x0 // +0.0 for double sub reduction + : getIdentityValueFor64BitWaveReduction(Opc); BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg) .addImm(IdentityValue); } @@ -5784,9 +6011,29 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, LaneValueReg) .addReg(SrcReg) .addReg(FF1Reg); - NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValueReg); + if (isFPOp) { + Register LaneValVreg = + MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + // Get the Lane Value in VGPR to avoid the Constant Bus Restriction + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), + LaneValVreg) + .addReg(LaneValueReg); + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(0) // src0 modifier + .addReg(Accumulator->getOperand(0).getReg()) + .addImm(0) // src1 modifier + .addReg(LaneValVreg) + .addImm(0) // clamp + .addImm(0); // omod + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } else { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValueReg); + } } else { Register LaneValueLoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -5832,7 +6079,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass); Register ComparisonResultReg = MRI.createVirtualRegister(WaveMaskRegClass); - const TargetRegisterClass *VregClass = TRI->getVGPR64Class(); + int SrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); + const TargetRegisterClass *VregClass = + TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), 
SrcIdx)); const TargetRegisterClass *VSubRegClass = TRI->getSubRegisterClass(VregClass, AMDGPU::sub0); Register AccumulatorVReg = MRI.createVirtualRegister(VregClass); @@ -5863,6 +6113,60 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addReg(Accumulator->getOperand(0).getReg()); break; } + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: { + int SrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); + const TargetRegisterClass *VregRC = + TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx)); + const TargetRegisterClass *VregSubRC = + TRI->getSubRegisterClass(VregRC, AMDGPU::sub0); + Register AccumulatorVReg = MRI.createVirtualRegister(VregRC); + Register DstVreg = MRI.createVirtualRegister(VregRC); + Register LaneValLo = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValHi = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg) + .addReg(Accumulator->getOperand(0).getReg()); + unsigned Modifier = + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 + ? 
SISrcMods::NEG + : SISrcMods::NONE; + auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(Modifier) // src0 modifiers + .addReg(LaneValue->getOperand(0).getReg()) + .addImm(SISrcMods::NONE) // src1 modifiers + .addReg(AccumulatorVReg) + .addImm(SISrcMods::NONE) // clamp + .addImm(SISrcMods::NONE); // omod + auto ReadLaneLo = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValLo); + auto ReadLaneHi = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + LaneValHi); + MachineBasicBlock::iterator Iters = *ReadLaneLo; + MachineOperand Op1L = + TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0), + VregRC, AMDGPU::sub0, VregSubRC); + MachineOperand Op1H = + TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0), + VregRC, AMDGPU::sub1, VregSubRC); + ReadLaneLo.add(Op1L); + ReadLaneHi.add(Op1H); + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(LaneValLo) + .addImm(AMDGPU::sub0) + .addReg(LaneValHi) + .addImm(AMDGPU::sub1); + break; + } case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) @@ -5918,6 +6222,13 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32); case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64); + case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64); + case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_MIN_NUM_F64_e64 + : AMDGPU::V_MIN_F64_e64); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64: @@ -5926,14 +6237,37 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32); case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64); + case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64); + case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? AMDGPU::V_MAX_NUM_F64_e64 + : AMDGPU::V_MAX_F64_e64); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64: + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? AMDGPU::V_ADD_F64_pseudo_e64 + : AMDGPU::V_ADD_F64_e64); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64: + // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as + // fadd + neg, by setting the NEG bit in the instruction. + return lowerWaveReduce(MI, *BB, *getSubtarget(), + ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_ADD_F64_pseudo_e64 + : AMDGPU::V_ADD_F64_e64); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: @@ -6203,7 +6537,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::SI_INDIRECT_SRC_V1: case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V3: case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V5: + case AMDGPU::SI_INDIRECT_SRC_V6: + case AMDGPU::SI_INDIRECT_SRC_V7: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V9: case AMDGPU::SI_INDIRECT_SRC_V10: @@ -6214,7 +6552,11 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V3: case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V5: + case AMDGPU::SI_INDIRECT_DST_V6: + case AMDGPU::SI_INDIRECT_DST_V7: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V9: case AMDGPU::SI_INDIRECT_DST_V10: @@ -6344,8 +6686,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::DS_GWS_INIT: case AMDGPU::DS_GWS_SEMA_BR: case AMDGPU::DS_GWS_BARRIER: - TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); - [[fallthrough]]; case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: @@ -6711,6 +7051,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerBRCOND(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::SPONENTRY: + return LowerSPONENTRY(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) && @@ -6743,6 +7085,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); return 
LowerGlobalAddress(MFI, Op, DAG); } + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: @@ -6792,6 +7136,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return splitTernaryVectorOp(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 && + Op.getValueType() == MVT::i16 && + Op.getOperand(0).getValueType() == MVT::f32) { + // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32. + return Op; + } return LowerFP_TO_INT(Op, DAG); case ISD::SHL: case ISD::SRA: @@ -7032,9 +7382,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); if (Src.getOpcode() == ISD::SETCC) { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + // Need to expand bfloat to float for comparison (setcc). + if (Op0.getValueType() == MVT::bf16) { + Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); + } // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), - Src.getOperand(1), Src.getOperand(2)); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2)); } if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { // (ballot 0) -> 0 @@ -7260,6 +7616,84 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, return DAG.getBitcast(VT, UnrolledLaneOp); } +static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits() != 32) + return SDValue(); + + SDLoc SL(N); + + SDValue Value = N->getOperand(1); + SDValue Index = N->getOperand(2); + + // ds_bpermute requires index to be multiplied by 4 + SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL); + SDValue ShiftedIndex = + DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount); + + // Intrinsics will require i32 to operate on + SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value); + + auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT, + SmallVector<SDValue> IntrinArgs) -> SDValue { + SmallVector<SDValue> Operands(1); + Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32); + Operands.append(IntrinArgs); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands); + }; + + // If we can bpermute across the whole wave, then just do that + if (TLI.getSubtarget()->supportsWaveWideBPermute()) { + SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, + {ShiftedIndex, ValueI32}); + return DAG.getBitcast(VT, BPermute); + } + + assert(TLI.getSubtarget()->isWave64()); + + // Otherwise, we need to make use of whole wave mode + SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0)); + + // Set inactive lanes to poison + SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, + {ValueI32, PoisonVal}); + SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, + {ShiftedIndex, PoisonVal}); + + 
SDValue Swapped = + MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue}); + + // Get permutation of each half, then we'll select which one to use + SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, + {WWMIndex, WWMValue}); + SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, + MVT::i32, {WWMIndex, Swapped}); + SDValue BPermOtherHalfWWM = + MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf}); + + // Select which side to take the permute from + SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32); + // We can get away with only using mbcnt_lo here since we're only + // trying to detect which side of 32 each lane is on, and mbcnt_lo + // returns 32 for lanes 32-63. + SDValue ThreadID = + MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32, + {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)}); + + SDValue SameOrOtherHalf = + DAG.getNode(ISD::AND, SL, MVT::i32, + DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index), + DAG.getTargetConstant(32, SL, MVT::i32)); + SDValue UseSameHalf = + DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf, + DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ); + SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf, + BPermOtherHalfWWM); + return DAG.getBitcast(VT, Result); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -7632,6 +8066,20 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } +SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // For functions that set up their own stack, select the GET_STACK_BASE + // pseudo. + if (MFI->isBottomOfStack()) + return Op; + + // For everything else, create a dummy stack object. 
+ int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false); + return DAG.getFrameIndex(FI, Op.getValueType()); +} + SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, EVT VT) const { return Op.getValueType().bitsLE(VT) @@ -7701,8 +8149,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { // Round-inexact-to-odd f64 to f32, then do the final rounding using the // hardware f32 -> bf16 instruction. - EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) : - MVT::f32; + EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32); SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG); return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod, DAG.getTargetConstant(0, DL, MVT::i32)); @@ -7849,14 +8296,13 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() : Op->getOperand(0).getValueType(); - auto ExtTy = OpTy.changeElementType(MVT::i32); + auto &DAG = DCI.DAG; + auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32); if (DCI.isBeforeLegalizeOps() || isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) return SDValue(); - auto &DAG = DCI.DAG; - SDLoc DL(Op); SDValue LHS; SDValue RHS; @@ -8033,7 +8479,7 @@ SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { - if (!Subtarget->isTrapHandlerEnabled() || + if (!Subtarget->hasTrapHandler() || Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return lowerTrapEndpgm(Op, DAG); @@ -8054,10 +8500,11 @@ SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, MachineFunction &MF = DAG.getMachineFunction(); uint64_t Offset = getImplicitParameterOffset(MF, Param); SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - return DAG.getLoad(VT, 
DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo PtrInfo = + getKernargSegmentPtrInfo(DAG.getMachineFunction()); + return DAG.getLoad( + VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment, + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op, @@ -8115,7 +8562,7 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); - if (!Subtarget->isTrapHandlerEnabled() || + if (!Subtarget->hasTrapHandler() || Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { LLVMContext &Ctx = MF.getFunction().getContext(); Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(), @@ -8319,6 +8766,9 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, Op.getValueType() == MVT::i64) { const SIMachineFunctionInfo *Info = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + if (Info->get32BitAddressHighBits() == 0) + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src); + SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); @@ -8847,17 +9297,17 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { - Type *Ty = GV->getValueType(); + const GlobalVariable &GVar = *cast<GlobalVariable>(GV); // HIP uses an unsized array `extern __shared__ T s[]` or similar // zero-sized type in other languages to declare the dynamic shared // memory which size is not known at the compile time. 
They will be // allocated by the runtime and placed directly after the static // allocated ones. They all share the same offset. - if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { + if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) { assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); // Adjust alignment for that dynamic shared memory array. Function &F = DAG.getMachineFunction().getFunction(); - MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(F, GVar); MFI->setUsesDynamicLDS(true); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); @@ -8912,6 +9362,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, MachineMemOperand::MOInvariant); } +SDValue SITargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Handle this. It should be mostly the same as LowerGlobalAddress. + const Function &Fn = DAG.getMachineFunction().getFunction(); + DAG.getContext()->diagnose(DiagnosticInfoUnsupported( + Fn, "unsupported external symbol", Op.getDebugLoc())); + return DAG.getPOISON(Op.getValueType()); +} + SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const { // We can't use S_MOV_B32 directly, because there is no way to specify m0 as @@ -9131,16 +9590,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. 
+ if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9159,8 +9625,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9170,7 +9638,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9396,8 +9866,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. 
+ if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9509,13 +9980,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); @@ -9709,7 +10187,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { - if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { + if (!AMDGPU::isKernel(MF.getFunction())) { // This only makes sense to call in a kernel, so just lower to null. 
return DAG.getConstant(0, DL, VT); } @@ -10110,11 +10588,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc SL(Op); auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), - {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), - Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), - IndexKey, Op.getOperand(7), - Op.getOperand(8)}); // No clamp operand + SmallVector<SDValue> Args{ + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), Op.getOperand(8)}; + if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8) + Args.push_back(Op.getOperand(9)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args); } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: @@ -10148,6 +10628,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Poisons.push_back(DAG.getPOISON(ValTy)); return DAG.getMergeValues(Poisons, SDLoc(Op)); } + case Intrinsic::amdgcn_wave_shuffle: + return lowerWaveShuffle(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -10455,9 +10937,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); - case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: - return lowerRawBufferAtomicIntrin(Op, DAG, - AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_swap: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, @@ -10499,10 +10978,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_dec: case 
Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB); + case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_CSUB); + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); - case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); @@ -10752,6 +11242,19 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), Ops, M->getMemOperand()); } + case Intrinsic::amdgcn_s_alloc_vgpr: { + SDValue NumVGPRs = Op.getOperand(2); + if (!NumVGPRs->isDivergent()) + return Op; + + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32); + NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + ReadFirstLaneID, NumVGPRs); + + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(), + Op.getOperand(0), Op.getOperand(1), NumVGPRs); + } case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_s_get_named_barrier_state: { SDValue Chain = Op->getOperand(0); @@ -10794,6 +11297,26 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return 
DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT, Chain, Ptr, MII->getMemOperand()); } + case Intrinsic::amdgcn_flat_load_monitor_b32: + case Intrinsic::amdgcn_flat_load_monitor_b64: + case Intrinsic::amdgcn_flat_load_monitor_b128: { + MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op); + SDValue Chain = Op->getOperand(0); + SDValue Ptr = Op->getOperand(2); + return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL, + Op->getVTList(), {Chain, Ptr}, + MII->getMemoryVT(), MII->getMemOperand()); + } + case Intrinsic::amdgcn_global_load_monitor_b32: + case Intrinsic::amdgcn_global_load_monitor_b64: + case Intrinsic::amdgcn_global_load_monitor_b128: { + MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op); + SDValue Chain = Op->getOperand(0); + SDValue Ptr = Op->getOperand(2); + return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL, + Op->getVTList(), {Chain, Ptr}, + MII->getMemoryVT(), MII->getMemOperand()); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -10932,12 +11455,24 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, return VData; } +static bool isAsyncLDSDMA(Intrinsic::ID Intr) { + switch (Intr) { + case Intrinsic::amdgcn_raw_buffer_load_async_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_async_lds: + return true; + } + return false; +} + SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = Op.getConstantOperandVal(1); - MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp_compr: { @@ -11128,15 +11663,21 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } case 
Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_buffer_load_async_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: - case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_async_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: { if (!Subtarget->hasVMemToLDSLoad()) return SDValue(); unsigned Opc; bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || - IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; + IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds || + IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds || + IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds; unsigned OpOffset = HasVIndex ? 1 : 0; SDValue VOffset = Op.getOperand(5 + OpOffset); bool HasVOffset = !isNullConstant(VOffset); @@ -11208,33 +11749,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, ? 1 : 0, DL, MVT::i8)); // swz + Ops.push_back( + DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8)); Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue auto *M = cast<MemSDNode>(Op); - MachineMemOperand *LoadMMO = M->getMemOperand(); - // Don't set the offset value here because the pointer points to the base of - // the buffer. 
- MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - - MachinePointerInfo StorePtrI = LoadPtrI; - LoadPtrI.V = PoisonValue::get( - PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); - LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; - StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; - - auto F = LoadMMO->getFlags() & - ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = - MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, - LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - - MachineMemOperand *StoreMMO = MF.getMachineMemOperand( - StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), - LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); - DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + DAG.setNodeMemRefs(Load, M->memoperands()); return SDValue(Load, 0); } @@ -11242,7 +11764,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, // for "trust me" that the remaining cases are global pointers until // such time as we can put two mem operands on an intrinsic. 
case Intrinsic::amdgcn_load_to_lds: - case Intrinsic::amdgcn_global_load_lds: { + case Intrinsic::amdgcn_load_async_to_lds: + case Intrinsic::amdgcn_global_load_lds: + case Intrinsic::amdgcn_global_load_async_lds: { if (!Subtarget->hasVMemToLDSLoad()) return SDValue(); @@ -11307,30 +11831,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } Ops.push_back(Op.getOperand(5)); // Offset - Ops.push_back(Op.getOperand(6)); // CPol + + unsigned Aux = Op.getConstantOperandVal(6); + Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL, + MVT::i32)); // CPol + Ops.push_back( + DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8)); + Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue auto *M = cast<MemSDNode>(Op); - MachineMemOperand *LoadMMO = M->getMemOperand(); - MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - LoadPtrI.Offset = Op->getConstantOperandVal(5); - MachinePointerInfo StorePtrI = LoadPtrI; - LoadPtrI.V = PoisonValue::get( - PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); - LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; - StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; - auto F = LoadMMO->getFlags() & - ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = - MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, - LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - MachineMemOperand *StoreMMO = MF.getMachineMemOperand( - StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), - LoadMMO->getAAInfo()); - auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); - DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + DAG.setNodeMemRefs(Load, M->memoperands()); return SDValue(Load, 0); } @@ -11375,6 +11888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } + case Intrinsic::amdgcn_s_wakeup_barrier: { + if 
(!Subtarget->hasSWakeupBarrier()) + return SDValue(); + [[fallthrough]]; + } case Intrinsic::amdgcn_s_barrier_join: { // these three intrinsics have one operand: barrier pointer SDValue Chain = Op->getOperand(0); @@ -11384,16 +11902,32 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, if (isa<ConstantSDNode>(BarOp)) { uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); - Opc = AMDGPU::S_BARRIER_JOIN_IMM; - + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; + break; + } // extract the BarrierID from bits 4-9 of the immediate unsigned BarID = (BarVal >> 4) & 0x3F; SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); Ops.push_back(K); Ops.push_back(Chain); } else { - Opc = AMDGPU::S_BARRIER_JOIN_M0; - + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_M0; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_M0; + break; + } // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] SDValue M0Val; M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, @@ -11482,7 +12016,7 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const { // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before // being added, so we can only safely match a 32-bit addition with no // unsigned overflow. 
- bool CheckNUW = AMDGPU::isGFX1250(*Subtarget); + bool CheckNUW = Subtarget->hasGFX1250Insts(); if (!CheckNUW || isNoUnsignedWrap(N0)) { C1 = cast<ConstantSDNode>(N0.getOperand(1)); N0 = N0.getOperand(0); @@ -11542,11 +12076,15 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, } } if (DAG.isBaseWithConstantOffset(CombinedOffset)) { + // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before + // being added, so we can only safely match a 32-bit addition with no + // unsigned overflow. + bool CheckNUW = Subtarget->hasGFX1250Insts(); SDValue N0 = CombinedOffset.getOperand(0); SDValue N1 = CombinedOffset.getOperand(1); uint32_t SOffset, ImmOffset; int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); - if (Offset >= 0 && + if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) && TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); @@ -11845,7 +12383,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Align Alignment = Load->getAlign(); unsigned AS = Load->getAddressSpace(); - if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && + if (Subtarget->hasLDSMisalignedBugInWGPMode() && + AS == AMDGPUAS::FLAT_ADDRESS && Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { return SplitVectorLoad(Op, DAG); } @@ -11866,7 +12405,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() && - isMemOpHasNoClobberedMemOperand(Load))) { + (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) { if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) && Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType() || @@ -12161,7 +12700,10 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) 
const { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); + // TODO: The combiner should probably handle elimination of redundant fabs. + SDValue r1 = DAG.SignBitIsZeroFP(RHS) + ? RHS + : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); const APFloat K0Val(0x1p+96f); const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); @@ -12466,7 +13008,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Store->getValue().getValueType().getScalarType() == MVT::i32); unsigned AS = Store->getAddressSpace(); - if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && + if (Subtarget->hasLDSMisalignedBugInWGPMode() && + AS == AMDGPUAS::FLAT_ADDRESS && Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { return SplitVectorStore(Op, DAG); @@ -12506,7 +13049,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); case 16: if (NumElements > 4 || - (NumElements == 3 && !Subtarget->enableFlatScratch())) + (NumElements == 3 && !Subtarget->hasFlatScratchEnabled())) return SplitVectorStore(Op, DAG); return SDValue(); default: @@ -12728,23 +13271,36 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // if Arg is already the result of a multiply by constant. auto Flags = Op->getFlags(); + // AMDGPUISD nodes of vector type must be unrolled here since + // they will not be expanded elsewhere. 
+ auto UnrollIfVec = [&DAG](SDValue V) -> SDValue { + if (!V.getValueType().isVector()) + return V; + + return DAG.UnrollVectorOp(cast<SDNode>(V)); + }; + SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); if (Subtarget->hasTrigReducedRange()) { SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); - TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); + TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags)); } else { TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); } switch (Op.getOpcode()) { case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); + TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); + break; case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); + TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); + break; default: llvm_unreachable("Wrong trig opcode"); } + + return UnrollIfVec(TrigVal); } SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, @@ -13398,6 +13954,7 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND_INREG: { @@ -13904,6 +14461,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { assert(OtherOp.getValueSizeInBits() == 32); } + // Check that we haven't just recreated the same FSHR node. 
+ if (N->getOpcode() == ISD::FSHR && + (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && + (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) + return SDValue(); + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { assert(Op.getValueType().isByteSized() && @@ -14181,10 +14744,11 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return SDValue(); } -SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N, + DAGCombinerInfo &DCI) const { if (!Subtarget->has16BitInsts() || - DCI.getDAGCombineLevel() < AfterLegalizeDAG) + DCI.getDAGCombineLevel() < AfterLegalizeTypes) return SDValue(); EVT VT = N->getValueType(0); @@ -14195,7 +14759,44 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, if (Src.getValueType() != MVT::i16) return SDValue(); - return SDValue(); + if (!Src->hasOneUse()) + return SDValue(); + + // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's + // possible we're missing out on some combine opportunities, but we'd need to + // weigh the cost of extracting the byte from the upper dwords. 
+ + std::optional<ByteProvider<SDValue>> BP0 = + calculateByteProvider(SDValue(N, 0), 0, 0, 0); + if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src) + return SDValue(); + SDValue V0 = *BP0->Src; + + std::optional<ByteProvider<SDValue>> BP1 = + calculateByteProvider(SDValue(N, 0), 1, 0, 1); + if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src) + return SDValue(); + + SDValue V1 = *BP1->Src; + + if (V0 == V1) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + uint32_t PermMask = 0x0c0c0c0c; + if (V0) { + V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32); + PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4); + } + + if (V1) { + V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32); + PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8); + } + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1, + DAG.getConstant(PermMask, DL, MVT::i32)); } SDValue @@ -14299,6 +14900,7 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, } bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, + SDNodeFlags UserFlags, unsigned MaxDepth) const { unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::FCANONICALIZE) @@ -14498,7 +15100,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, // FIXME: denormalsEnabledForType is broken for dynamic return denormalsEnabledForType(DAG, Op.getValueType()) && - DAG.isKnownNeverSNaN(Op); + (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op)); } bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, @@ -14993,8 +15595,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // for some types, but at a higher cost since it's implemented with a 3 // operand form. 
const SDNodeFlags Flags = N->getFlags(); - if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && - !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) { + if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() && + !Subtarget->hasIEEEMinimumMaximumInsts() && + isOperationLegal(ISD::FMINNUM_IEEE, VT.getScalarType())) { unsigned NewOpc = Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags); @@ -16335,7 +16938,9 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); - if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts()) + + // fsqrt legality correlates to rsq availability. + if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT)) return SDValue(); SDValue LHS = N->getOperand(0); @@ -16370,7 +16975,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); - EVT IntVT = VT.changeElementType(MVT::i32); + EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32); if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() && (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) { @@ -16548,7 +17153,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, LHS.getOpcode() == ISD::SELECT && isa<ConstantSDNode>(LHS.getOperand(1)) && isa<ConstantSDNode>(LHS.getOperand(2)) && - LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) && isBoolSGPR(LHS.getOperand(0))) { // Given CT != FT: // setcc (select cc, CT, CF), CF, eq => xor cc, -1 @@ -16558,13 +17162,34 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, const APInt &CT = LHS.getConstantOperandAPInt(1); const APInt &CF = LHS.getConstantOperandAPInt(2); - if ((CF == CRHSVal && CC == ISD::SETEQ) || - (CT == CRHSVal && CC == ISD::SETNE)) - return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - 
DAG.getAllOnesConstant(SL, MVT::i1)); - if ((CF == CRHSVal && CC == ISD::SETNE) || - (CT == CRHSVal && CC == ISD::SETEQ)) - return LHS.getOperand(0); + if (CT != CF) { + if ((CF == CRHSVal && CC == ISD::SETEQ) || + (CT == CRHSVal && CC == ISD::SETNE)) + return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1); + if ((CF == CRHSVal && CC == ISD::SETNE) || + (CT == CRHSVal && CC == ISD::SETEQ)) + return LHS.getOperand(0); + } + } + + // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge + // => setcc v.hi32, 0xXXXX'XXXX, lt/ge + // + // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt + // => setcc v.hi32, 0xXXXX'XXXX, le/gt + if (VT == MVT::i64) { + const uint64_t Mask32 = maskTrailingOnes<uint64_t>(32); + const uint64_t CRHSInt = CRHSVal.getZExtValue(); + + if ( // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge + ((CRHSInt & Mask32) == 0 && (CC == ISD::SETULT || CC == ISD::SETUGE || + CC == ISD::SETLT || CC == ISD::SETGE)) || + // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt + ((CRHSInt & Mask32) == Mask32 && + (CC == ISD::SETULE || CC == ISD::SETUGT || CC == ISD::SETLE || + CC == ISD::SETGT))) + return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG), + DAG.getConstant(CRHSInt >> 32, SL, MVT::i32), CC); } } @@ -16877,8 +17502,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } case ISD::XOR: return performXorCombine(N, DCI); + case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: - return performZeroExtendCombine(N, DCI); + return performZeroOrAnyExtendCombine(N, DCI); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI); case AMDGPUISD::FP_CLASS: @@ -17335,12 +17961,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { // Abandon attempt if the dst size isn't large enough // - this is in fact an error but this is picked up elsewhere and // reported correctly. 
- uint32_t DstSize = - TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + + uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32; if (DstSize < InitIdx) return; } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { - InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + InitIdx = TRI.getRegSizeInBits(*DstRC) / 32; } else { return; } @@ -17388,7 +18016,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *MF = MI.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); if (TII->isVOP3(MI.getOpcode())) { @@ -17524,6 +18152,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, break; case 'v': switch (BitWidth) { + case 1: + return std::pair(0U, nullptr); case 16: RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32_Lo256RegClass; @@ -17541,6 +18171,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, if (!Subtarget->hasMAIInsts()) break; switch (BitWidth) { + case 1: + return std::pair(0U, nullptr); case 16: RC = &AMDGPU::AGPR_32RegClass; break; @@ -18050,6 +18682,11 @@ void SITargetLowering::computeKnownBitsForTargetInstr( case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: Known.Zero.setHighBits(16); break; + case AMDGPU::G_AMDGPU_COPY_SCC_VCC: + // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32, + // producing exactly 0 or 1. 
+ Known.Zero.setHighBits(Known.getBitWidth() - 1); + break; case AMDGPU::G_AMDGPU_SMED3: case AMDGPU::G_AMDGPU_UMED3: { auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); @@ -18226,7 +18863,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_INC: case AMDGPUISD::BUFFER_ATOMIC_DEC: case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: - case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: @@ -18378,12 +19014,12 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local // allocations work. if (HasSystemScope) { - if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && + if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() && RMW->hasMetadata("amdgpu.no.remote.memory")) return true; if (Subtarget.hasEmulatedSystemScopeAtomics()) return true; - } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics()) return true; return RMW->hasMetadata("amdgpu.no.fine.grained.memory"); @@ -18413,7 +19049,7 @@ getPrivateAtomicExpansionKind(const GCNSubtarget &STI) { } TargetLowering::AtomicExpansionKind -SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { +SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::PRIVATE_ADDRESS) return getPrivateAtomicExpansionKind(*getSubtarget()); @@ -18461,7 +19097,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: case AtomicRMWInst::UIncWrap: - case AtomicRMWInst::UDecWrap: { + case AtomicRMWInst::UDecWrap: + case AtomicRMWInst::USubCond: + case AtomicRMWInst::USubSat: { + if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts()) + 
return AtomicExpansionKind::CmpXChg; + if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts()) + return AtomicExpansionKind::CmpXChg; + if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) { + auto *IT = dyn_cast<IntegerType>(RMW->getType()); + if (!IT || IT->getBitWidth() != 32) + return AtomicExpansionKind::CmpXChg; + } + if (AMDGPU::isFlatGlobalAddrSpace(AS) || AS == AMDGPUAS::BUFFER_FAT_POINTER) { if (Subtarget->hasEmulatedSystemScopeAtomics()) @@ -18481,7 +19129,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // If fine-grained remote memory works at device scope, we don't need to // do anything. if (!HasSystemScope && - Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics()) return atomicSupportedIfLegalIntType(RMW); // If we are targeting a remote allocated address, it depends what kind of @@ -18500,7 +19148,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { Op == AtomicRMWInst::Xor) { // Atomic sub/or/xor do not work over PCI express, but atomic add // does. InstCombine transforms these with 0 to or, so undo that. 
- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); + if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); ConstVal && ConstVal->isNullValue()) return AtomicExpansionKind::CustomExpand; } @@ -18699,7 +19347,8 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { } TargetLowering::AtomicExpansionKind -SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { +SITargetLowering::shouldExpandAtomicCmpXchgInIR( + const AtomicCmpXchgInst *CmpX) const { unsigned AddrSpace = CmpX->getPointerAddressSpace(); if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return getPrivateAtomicExpansionKind(*getSubtarget()); @@ -18726,8 +19375,11 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { : &AMDGPU::SReg_32RegClass; if (!TRI->isSGPRClass(RC) && !isDivergent) return TRI->getEquivalentSGPRClass(RC); - if (TRI->isSGPRClass(RC) && isDivergent) + if (TRI->isSGPRClass(RC) && isDivergent) { + if (Subtarget->hasGFX90AInsts()) + return TRI->getEquivalentAVClass(RC); return TRI->getEquivalentVGPRClass(RC); + } return RC; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 74e58f4..59b8f43 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -45,6 +45,8 @@ public: LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override; + MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const; + private: SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; @@ -77,6 +79,8 @@ private: SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; SDValue lowerImage(SDValue Op, 
const AMDGPU::ImageDimIntrinsicInfo *Intr, @@ -128,6 +132,7 @@ private: SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const; SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, ArrayRef<SDValue> Ops, bool IsIntrinsic = false) const; @@ -205,7 +210,7 @@ private: SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performZeroOrAnyExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSignExtendInRegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT, @@ -332,7 +337,7 @@ public: MVT getPointerTy(const DataLayout &DL, unsigned AS) const override; MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override; - bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, + void getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override; @@ -555,7 +560,7 @@ public: Register N1) const override; bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - unsigned MaxDepth = 5) const; + SDNodeFlags UserFlags = {}, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, const MachineFunction &MF, unsigned MaxDepth = 5) const; bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const; @@ -564,11 +569,12 @@ public: bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN = false, unsigned Depth = 0) const override; - AtomicExpansionKind 
shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override; AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override; AtomicExpansionKind - shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override; void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const; void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced..1118675 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -42,6 +42,7 @@ #include "llvm/TargetParser/TargetParser.h" using namespace llvm; +using namespace llvm::AMDGPU; #define DEBUG_TYPE "si-insert-waitcnts" @@ -63,58 +64,96 @@ static cl::opt<bool> ForceEmitZeroLoadFlag( cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden); +static cl::opt<bool> ExpertSchedulingModeFlag( + "amdgpu-expert-scheduling-mode", + cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), + cl::init(false), cl::Hidden); + namespace { -// Class of object that encapsulates latest instruction counter score -// associated with the operand. Used for determining whether -// s_waitcnt instruction needs to be emitted. - -enum InstCounterType { - LOAD_CNT = 0, // VMcnt prior to gfx12. - DS_CNT, // LKGMcnt prior to gfx12. - EXP_CNT, // - STORE_CNT, // VScnt in gfx10/gfx11. - NUM_NORMAL_INST_CNTS, - SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. - BVH_CNT, // gfx12+ only. - KM_CNT, // gfx12+ only. - X_CNT, // gfx1250. - NUM_EXTENDED_INST_CNTS, - NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS -}; -} // namespace +// Get the maximum wait count value for a given counter type. 
+static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits, + InstCounterType T) { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + case VA_VDST: + return Limits.VaVdstMax; + case VM_VSRC: + return Limits.VmVsrcMax; + default: + return 0; + } +} -namespace llvm { -template <> struct enum_iteration_traits<InstCounterType> { - static constexpr bool is_iterable = true; -}; -} // namespace llvm +static bool isSoftXcnt(MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft; +} -namespace { -// Return an iterator over all counters between LOAD_CNT (the first counter) -// and \c MaxCounter (exclusive, default value yields an enumeration over -// all counters). -auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { - return enum_seq(LOAD_CNT, MaxCounter); +static bool isAtomicRMW(MachineInstr &MI) { + return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() && + MI.mayStore(); } -using RegInterval = std::pair<int, int>; - -struct HardwareLimits { - unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. - unsigned ExpcntMax; - unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. - unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. - unsigned SamplecntMax; // gfx12+ only. - unsigned BvhcntMax; // gfx12+ only. - unsigned KmcntMax; // gfx12+ only. - unsigned XcntMax; // gfx1250. +enum class AtomicRMWState { + NewBlock, // Start of a new atomic RMW block + InsideBlock, // Middle of an existing block + NotInBlock // Not in an atomic RMW block }; +/// Integer IDs used to track vector memory locations we may have to wait on. 
+/// Encoded as u16 chunks: +/// +/// [0, REGUNITS_END ): MCRegUnit +/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs +/// +/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary. +/// It gives (2 << 16) - 1 entries per category which is more than enough +/// for all register units. MCPhysReg is u16 so we don't even support >u16 +/// physical register numbers at this time, let alone >u16 register units. +/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END +/// is enough for all register units. +using VMEMID = uint32_t; + +enum : VMEMID { + TRACKINGID_RANGE_LEN = (1 << 16), + + // Important: MCRegUnits must always be tracked starting from 0, as we + // need to be able to convert between a MCRegUnit and a VMEMID freely. + REGUNITS_BEGIN = 0, + REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN, + + // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common" + // entry, which is updated for all LDS DMA operations encountered. + // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1. + NUM_LDSDMA = TRACKINGID_RANGE_LEN, + LDSDMA_BEGIN = REGUNITS_END, + LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA, +}; + +/// Convert a MCRegUnit to a VMEMID. 
+static constexpr VMEMID toVMEMID(MCRegUnit RU) { + return static_cast<unsigned>(RU); +} + #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ - DECL(VMEM_ACCESS) /* vmem read & write */ \ - DECL(VMEM_READ_ACCESS) /* vmem read */ \ + DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \ DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \ + DECL(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */ \ DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \ DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \ DECL(VMEM_GROUP) /* vmem group */ \ @@ -129,7 +168,14 @@ struct HardwareLimits { DECL(EXP_POS_ACCESS) /* write to export position */ \ DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \ DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \ - DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ + DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */ \ + DECL(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */ \ + DECL(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */ \ + DECL(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */ \ + DECL(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */ \ + DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \ + DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \ + DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ // clang-format off #define AMDGPU_EVENT_ENUM(Name) Name, @@ -138,38 +184,33 @@ enum WaitEventType { NUM_WAIT_EVENTS }; #undef AMDGPU_EVENT_ENUM +} // namespace + +namespace llvm { +template <> struct enum_iteration_traits<WaitEventType> { + static constexpr bool is_iterable = true; +}; +} // namespace llvm + +namespace { + +/// Return an iterator over all events between VMEM_ACCESS (the first event) +/// and \c MaxEvent (exclusive, default value yields an enumeration over +/// all counters). 
+auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) { + return enum_seq(VMEM_ACCESS, MaxEvent); +} #define AMDGPU_EVENT_NAME(Name) #Name, static constexpr StringLiteral WaitEventTypeName[] = { AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME) }; #undef AMDGPU_EVENT_NAME +static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) { + return WaitEventTypeName[Event]; +} // clang-format on -// The mapping is: -// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs -// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots -// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs -// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC -// We reserve a fixed number of VGPR slots in the scoring tables for -// special tokens like SCMEM_LDS (needed for buffer load to LDS). -enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets. - AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. - SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. - // Artificial register slots to track LDS writes into specific LDS locations - // if a location is known. When slots are exhausted or location is - // unknown use the first slot. The first slot is also always updated in - // addition to known location's slot to properly generate waits if dependent - // instruction's location is unknown. - FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores. - NUM_LDS_VGPRS = 9, // One more than the stores we track. - NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start. - NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS, - // Remaining non-allocatable registers - SCC = NUM_ALL_ALLOCATABLE -}; - // Enumerate different types of result-returning VMEM operations. 
Although // s_waitcnt orders them all with a single vmcnt counter, in the absence of // s_waitcnt only instructions of the same VmemType are guaranteed to write @@ -187,7 +228,7 @@ enum VmemType { // Maps values of InstCounterType to the instruction that waits on that // counter. Only used if GCNSubtarget::hasExtendedWaitCounts() -// returns true. +// returns true, and does not cover VA_VDST or VM_VSRC. static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, @@ -224,49 +265,80 @@ VmemType getVmemType(const MachineInstr &Inst) { return VMEM_NOSAMPLER; } -unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { - switch (T) { - case LOAD_CNT: - return Wait.LoadCnt; - case EXP_CNT: - return Wait.ExpCnt; - case DS_CNT: - return Wait.DsCnt; - case STORE_CNT: - return Wait.StoreCnt; - case SAMPLE_CNT: - return Wait.SampleCnt; - case BVH_CNT: - return Wait.BvhCnt; - case KM_CNT: - return Wait.KmCnt; - case X_CNT: - return Wait.XCnt; - default: - llvm_unreachable("bad InstCounterType"); - } -} - void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { - unsigned &WC = getCounterRef(Wait, T); - WC = std::min(WC, Count); + Wait.set(T, std::min(Wait.get(T), Count)); } -void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { - getCounterRef(Wait, T) = ~0u; -} +void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { Wait.set(T, ~0u); } -unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { - return getCounterRef(Wait, T); -} +/// A small set of events. +class WaitEventSet { + unsigned Mask = 0; -// Mapping from event to counter according to the table masks. 
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { - for (auto T : inst_counter_types()) { - if (masks[T] & (1 << E)) - return T; +public: + WaitEventSet() = default; + explicit constexpr WaitEventSet(WaitEventType Event) { + static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8, + "Not enough bits in Mask for all the events"); + Mask |= 1 << Event; + } + constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) { + for (auto &E : Events) { + Mask |= 1 << E; + } + } + void insert(const WaitEventType &Event) { Mask |= 1 << Event; } + void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); } + void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; } + bool contains(const WaitEventType &Event) const { + return Mask & (1 << Event); + } + /// \Returns true if this set contains all elements of \p Other. + bool contains(const WaitEventSet &Other) const { + return (~Mask & Other.Mask) == 0; + } + /// \Returns the intersection of this and \p Other. + WaitEventSet operator&(const WaitEventSet &Other) const { + auto Copy = *this; + Copy.Mask &= Other.Mask; + return Copy; + } + /// \Returns the union of this and \p Other. + WaitEventSet operator|(const WaitEventSet &Other) const { + auto Copy = *this; + Copy.Mask |= Other.Mask; + return Copy; + } + /// This set becomes the union of this and \p Other. + WaitEventSet &operator|=(const WaitEventSet &Other) { + Mask |= Other.Mask; + return *this; + } + /// This set becomes the intersection of this and \p Other. + WaitEventSet &operator&=(const WaitEventSet &Other) { + Mask &= Other.Mask; + return *this; + } + bool operator==(const WaitEventSet &Other) const { + return Mask == Other.Mask; + } + bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); } + bool empty() const { return Mask == 0; } + /// \Returns true if the set contains more than one element. 
+ bool twoOrMore() const { return Mask & (Mask - 1); } + operator bool() const { return !empty(); } + void print(raw_ostream &OS) const { + ListSeparator LS(", "); + for (WaitEventType Event : wait_events()) { + OS << LS << getWaitEventTypeName(Event); + } } - llvm_unreachable("event type has no associated counter"); + LLVM_DUMP_METHOD void dump() const; +}; + +void WaitEventSet::dump() const { + print(dbgs()); + dbgs() << "\n"; } class WaitcntBrackets; @@ -279,24 +351,33 @@ class WaitcntBrackets; // otherwise have had to become. class WaitcntGenerator { protected: - const GCNSubtarget *ST = nullptr; - const SIInstrInfo *TII = nullptr; + const GCNSubtarget &ST; + const SIInstrInfo &TII; AMDGPU::IsaVersion IV; InstCounterType MaxCounter; bool OptNone; + bool ExpandWaitcntProfiling = false; + const AMDGPU::HardwareLimits *Limits = nullptr; public: - WaitcntGenerator() = default; - WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) - : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), - IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), + WaitcntGenerator() = delete; + WaitcntGenerator(const WaitcntGenerator &) = delete; + WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter, + const AMDGPU::HardwareLimits *Limits) + : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), + IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter), OptNone(MF.getFunction().hasOptNone() || - MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} + MF.getTarget().getOptLevel() == CodeGenOptLevel::None), + ExpandWaitcntProfiling( + MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")), + Limits(Limits) {} // Return true if the current function should be compiled with no // optimization. 
bool isOptNone() const { return OptNone; } + const AMDGPU::HardwareLimits &getLimits() const { return *Limits; } + // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -316,39 +397,51 @@ public: // Transform a soft waitcnt into a normal one. bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; - // Generates new wait count instructions according to the value of + // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. + // ScoreBrackets is used for profiling expansion. virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) = 0; + AMDGPU::Waitcnt Wait, + const WaitcntBrackets &ScoreBrackets) = 0; - // Returns an array of bit masks which can be used to map values in - // WaitEventType to corresponding counter values in InstCounterType. - virtual const unsigned *getWaitEventMask() const = 0; + // Returns the WaitEventSet that corresponds to counter \p T. + virtual const WaitEventSet &getWaitEvents(InstCounterType T) const = 0; + + /// \returns the counter that corresponds to event \p E. + InstCounterType getCounterFromEvent(WaitEventType E) const { + for (auto T : inst_counter_types()) { + if (getWaitEvents(T).contains(E)) + return T; + } + llvm_unreachable("event type has no associated counter"); + } // Returns a new waitcnt with all counters except VScnt set to 0. If // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; virtual ~WaitcntGenerator() = default; - - // Create a mask value from the initializer list of wait event types. 
- static constexpr unsigned - eventMask(std::initializer_list<WaitEventType> Events) { - unsigned Mask = 0; - for (auto &E : Events) - Mask |= 1 << E; - - return Mask; - } }; -class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { -public: - WaitcntGeneratorPreGFX12() = default; - WaitcntGeneratorPreGFX12(const MachineFunction &MF) - : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {} +class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator { + static constexpr const WaitEventSet + WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { + WaitEventSet( + {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}), + WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), + WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, + EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}), + WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), + WaitEventSet(), + WaitEventSet(), + WaitEventSet(), + WaitEventSet(), + WaitEventSet(), + WaitEventSet()}; +public: + using WaitcntGenerator::WaitcntGenerator; bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, @@ -356,35 +449,41 @@ public: bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; - - const unsigned *getWaitEventMask() const override { - assert(ST); + AMDGPU::Waitcnt Wait, + const WaitcntBrackets &ScoreBrackets) override; - static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { - eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, - VMEM_BVH_READ_ACCESS}), - eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), - eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, - EXP_POS_ACCESS, EXP_LDS_ACCESS}), - eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), - 0, - 0, - 0, - 0}; - - return WaitEventMaskForInstPreGFX12; + const WaitEventSet &getWaitEvents(InstCounterType T) const override { + return 
WaitEventMaskForInstPreGFX12[T]; } AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; -class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { +class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator { +protected: + bool IsExpertMode; + static constexpr const WaitEventSet + WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { + WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}), + WaitEventSet({LDS_ACCESS, GDS_ACCESS}), + WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, + EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}), + WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), + WaitEventSet({VMEM_SAMPLER_READ_ACCESS}), + WaitEventSet({VMEM_BVH_READ_ACCESS}), + WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), + WaitEventSet({VMEM_GROUP, SMEM_GROUP}), + WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE, + VGPR_XDL_WRITE}), + WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})}; + public: - WaitcntGeneratorGFX12Plus() = default; + WaitcntGeneratorGFX12Plus() = delete; WaitcntGeneratorGFX12Plus(const MachineFunction &MF, - InstCounterType MaxCounter) - : WaitcntGenerator(MF, MaxCounter) {} + InstCounterType MaxCounter, + const AMDGPU::HardwareLimits *Limits, + bool IsExpertMode) + : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {} bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -393,28 +492,22 @@ public: bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; - - const unsigned *getWaitEventMask() const override { - assert(ST); + AMDGPU::Waitcnt Wait, + const WaitcntBrackets &ScoreBrackets) override; - static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { - eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}), - eventMask({LDS_ACCESS, GDS_ACCESS}), - eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, - EXP_POS_ACCESS, EXP_LDS_ACCESS}), - eventMask({VMEM_WRITE_ACCESS, 
SCRATCH_WRITE_ACCESS}), - eventMask({VMEM_SAMPLER_READ_ACCESS}), - eventMask({VMEM_BVH_READ_ACCESS}), - eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}), - eventMask({VMEM_GROUP, SMEM_GROUP})}; - - return WaitEventMaskForInstGFX12Plus; + const WaitEventSet &getWaitEvents(InstCounterType T) const override { + return WaitEventMaskForInstGFX12Plus[T]; } AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; +// Flags indicating which counters should be flushed in a loop preheader. +struct PreheaderFlushFlags { + bool FlushVmCnt = false; + bool FlushDsCnt = false; +}; + class SIInsertWaitcnts { public: const GCNSubtarget *ST; @@ -423,11 +516,11 @@ public: const MachineRegisterInfo *MRI = nullptr; InstCounterType SmemAccessCounter; InstCounterType MaxCounter; - const unsigned *WaitEventMaskForInst; + bool IsExpertMode = false; private: DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; - DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; + DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush; MachineLoopInfo *MLI; MachinePostDominatorTree *PDT; AliasAnalysis *AA = nullptr; @@ -441,19 +534,18 @@ private: bool ForceEmitWaitcnt[NUM_INST_CNTS]; - // In any given run of this pass, WCG will point to one of these two - // generator objects, which must have been re-initialised before use - // from a value made using a subtarget constructor. - WaitcntGeneratorPreGFX12 WCGPreGFX12; - WaitcntGeneratorGFX12Plus WCGGFX12Plus; + std::unique_ptr<WaitcntGenerator> WCG; - WaitcntGenerator *WCG = nullptr; + // Remember call and return instructions in the function. + DenseSet<MachineInstr *> CallInsts; + DenseSet<MachineInstr *> ReturnInsts; - // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS - // message. - DenseSet<MachineInstr *> ReleaseVGPRInsts; + // Remember all S_ENDPGM instructions. 
The boolean flag is true if there might + // be outstanding stores but definitely no outstanding scratch stores, to help + // with insertion of DEALLOC_VGPRS messages. + DenseMap<MachineInstr *, bool> EndPgmInsts; - HardwareLimits Limits; + AMDGPU::HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -464,34 +556,15 @@ public: (void)ForceVMCounter; } - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } + const AMDGPU::HardwareLimits &getLimits() const { return Limits; } - bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); - bool isPreheaderToFlush(MachineBasicBlock &MBB, - const WaitcntBrackets &ScoreBrackets); + PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML, + const WaitcntBrackets &Brackets); + PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB, + const WaitcntBrackets &ScoreBrackets); bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; + bool isDSRead(const MachineInstr &MI) const; + bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const; bool run(MachineFunction &MF); void setForceEmitWaitcnt() { @@ -524,6 +597,9 @@ public: ForceEmitWaitcnt[SAMPLE_CNT] = false; ForceEmitWaitcnt[BVH_CNT] = false; } + + ForceEmitWaitcnt[VA_VDST] = false; + ForceEmitWaitcnt[VM_VSRC] = false; #endif // NDEBUG } @@ -531,8 +607,10 @@ public: // instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { switch (Inst.getOpcode()) { + // FIXME: GLOBAL_INV needs to be tracked with xcnt too. 
case AMDGPU::GLOBAL_INV: - return VMEM_READ_ACCESS; // tracked using loadcnt + return GLOBAL_INV_ACCESS; // tracked using loadcnt, but doesn't write + // VGPRs case AMDGPU::GLOBAL_WB: case AMDGPU::GLOBAL_WBINV: return VMEM_WRITE_ACCESS; // tracked using storecnt @@ -542,7 +620,7 @@ public: // Maps VMEM access types to their corresponding WaitEventType. static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { - VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; + VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; assert(SIInstrInfo::isVMEM(Inst)); // LDS DMA loads are also stores, but on the LDS side. On the VMEM side @@ -551,22 +629,41 @@ public: return VMEM_ACCESS; if (Inst.mayStore() && (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { - // FLAT and SCRATCH instructions may access scratch. Other VMEM - // instructions do not. - if (TII->mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratch(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) - return VMEM_READ_ACCESS; + return VMEM_ACCESS; return VmemReadMapping[getVmemType(Inst)]; } + std::optional<WaitEventType> + getExpertSchedulingEventType(const MachineInstr &Inst) const; + + bool isAsync(const MachineInstr &MI) const { + if (!SIInstrInfo::isLDSDMA(MI)) + return false; + if (SIInstrInfo::usesASYNC_CNT(MI)) + return true; + const MachineOperand *Async = + TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync); + return Async && (Async->getImm()); + } + + bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const { + return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI); + } + + bool isAsyncLdsDmaWrite(const MachineInstr &MI) const { + return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI); + } + bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr, - bool FlushVmCnt); + 
PreheaderFlushFlags FlushFlags); bool generateWaitcnt(AMDGPU::Waitcnt Wait, MachineBasicBlock::instr_iterator It, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, @@ -579,6 +676,16 @@ public: WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); + void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + bool ExpertMode) const; + AtomicRMWState getAtomicRMWState(MachineInstr &MI, + AtomicRMWState PrevState) const; + const WaitEventSet &getWaitEvents(InstCounterType T) const { + return WCG->getWaitEvents(T); + } + InstCounterType getCounterFromEvent(WaitEventType E) const { + return WCG->getCounterFromEvent(E); + } }; // This objects maintains the current score brackets of each wait counter, and @@ -591,7 +698,30 @@ public: // "s_waitcnt 0" before use. class WaitcntBrackets { public: - WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) { + assert(Context->TRI->getNumRegUnits() < REGUNITS_END); + } + +#ifndef NDEBUG + ~WaitcntBrackets() { + unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0; + for (auto &[ID, Val] : VMem) { + if (Val.empty()) + ++NumUnusedVmem; + } + for (auto &[ID, Val] : SGPRs) { + if (Val.empty()) + ++NumUnusedSGPRs; + } + + if (NumUnusedVmem || NumUnusedSGPRs) { + errs() << "WaitcntBracket had unused entries at destruction time: " + << NumUnusedVmem << " VMem and " << NumUnusedSGPRs + << " SGPR unused entries\n"; + std::abort(); + } + } +#endif bool isSmemCounter(InstCounterType T) const { return T == Context->SmemAccessCounter || T == X_CNT; @@ -602,6 +732,18 @@ public: return T == X_CNT ? 
1 : 0; } + unsigned getOutstanding(InstCounterType T) const { + return ScoreUBs[T] - ScoreLBs[T]; + } + + bool hasPendingVMEM(VMEMID ID, InstCounterType T) const { + return getVMemScore(ID, T) > getScoreLB(T); + } + + /// \Return true if we have no score entries for counter \p T. + bool empty(InstCounterType T) const { return getScoreRange(T) == 0; } + +private: unsigned getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); return ScoreLBs[T]; @@ -616,53 +758,58 @@ public: return getScoreUB(T) - getScoreLB(T); } - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - - if (GprNo < NUM_ALL_ALLOCATABLE) - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const { + auto It = SGPRs.find(RU); + return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0; + } - assert(GprNo == SCC); - return SCCScore; + unsigned getVMemScore(VMEMID TID, InstCounterType T) const { + auto It = VMem.find(TID); + return It != VMem.end() ? 
It->second.Scores[T] : 0; } +public: bool merge(const WaitcntBrackets &Other); - RegInterval getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const; - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + simplifyWaitcnt(Wait, Wait); } + void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const; + void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const; + + void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const; + void determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const; + AMDGPU::Waitcnt determineAsyncWait(unsigned N); void tryClearSCCWriteEvent(MachineInstr *Inst); void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); void updateByEvent(WaitEventType E, MachineInstr &MI); + void recordAsyncMark(MachineInstr &MI); - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); + bool hasPendingEvent() const { return !PendingEvents.empty(); } + bool hasPendingEvent(WaitEventType E) const { + return PendingEvents.contains(E); } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; - assert((HasPending != 0) == 
(getScoreRange(T) != 0)); + bool hasPendingEvent(InstCounterType T) const { + bool HasPending = PendingEvents & Context->getWaitEvents(T); + assert(HasPending == !empty(T) && + "Expected pending events iff scoreboard is not empty"); return HasPending; } bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); + WaitEventSet Events = PendingEvents & Context->getWaitEvents(T); // Return true if more than one bit is set in Events. - return Events & (Events - 1); + return Events.twoOrMore(); } bool hasPendingFlat() const { @@ -683,33 +830,36 @@ public: unsigned getPendingGDSWait() const { return std::min(getScoreUB(DS_CNT) - LastGDS, - Context->getWaitCountMax(DS_CNT) - 1); + getWaitCountMax(Context->getLimits(), DS_CNT) - 1); } void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } // Return true if there might be pending writes to the vgpr-interval by VMEM // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) + bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const { + for (MCRegUnit RU : regunits(Reg)) { + auto It = VMem.find(toVMEMID(RU)); + if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V))) return true; } return false; } - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; + void clearVgprVmemTypes(MCPhysReg Reg) { + for (MCRegUnit RU : regunits(Reg)) { + if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) { + It->second.VMEMTypes = 0; + if (It->second.empty()) + VMem.erase(It); + } } } void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, - getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); - PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + 
setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + + getWaitCountMax(Context->getLimits(), STORE_CNT)); + PendingEvents |= Context->getWaitEvents(STORE_CNT); } ArrayRef<const MachineInstr *> getLDSDMAStores() const { @@ -718,11 +868,15 @@ public: bool hasPointSampleAccel(const MachineInstr &MI) const; bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; + MCPhysReg RU) const; void print(raw_ostream &) const; void dump() const { print(dbgs()); } + // Free up memory by removing empty entries from the DenseMap that track event + // scores. + void purgeEmptyTrackingData(); + private: struct MergeInfo { unsigned OldLB; @@ -730,8 +884,27 @@ private: unsigned MyShift; unsigned OtherShift; }; + + using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>; + + void determineWaitForScore(InstCounterType T, unsigned Score, + AMDGPU::Waitcnt &Wait) const; + static bool mergeScore(const MergeInfo &M, unsigned &Score, unsigned OtherScore); + bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos, + ArrayRef<CounterValueArray> OtherMarks); + + iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const { + assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC"); + if (!Context->TRI->isInAllocatableClass(Reg)) + return {{}, {}}; + const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg); + unsigned Size = Context->TRI->getRegSizeInBits(*RC); + if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) + Reg = Context->TRI->get32BitRegister(Reg); + return Context->TRI->regunits(Reg); + } void setScoreLB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); @@ -745,49 +918,95 @@ private: if (T != EXP_CNT) return; - if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT)) + ScoreLBs[EXP_CNT] = + ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT); + } + 
+ void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { + const SIRegisterInfo *TRI = Context->TRI; + if (Reg == AMDGPU::SCC) { + SCCScore = Val; + } else if (TRI->isVectorRegister(*Context->MRI, Reg)) { + for (MCRegUnit RU : regunits(Reg)) + VMem[toVMEMID(RU)].Scores[T] = Val; + } else if (TRI->isSGPRReg(*Context->MRI, Reg)) { + auto STy = getSgprScoresIdx(T); + for (MCRegUnit RU : regunits(Reg)) + SGPRs[RU].Scores[STy] = Val; + } else { + llvm_unreachable("Register cannot be tracked/unknown register!"); + } } - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); + void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) { + VMem[TID].Scores[T] = Val; } - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op, - InstCounterType CntTy, unsigned Val); + void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); const SIInsertWaitcnts *Context; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; + WaitEventSet PendingEvents; // Remember the last flat memory operation. unsigned LastFlat[NUM_INST_CNTS] = {0}; // Remember the last GDS operation. unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + + // The score tracking logic is fragmented as follows: + // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding. 
+ // - SGPRs: SGPR RegUnits + // - SCC: Non-allocatable and not general purpose: not a SGPR. + // + // For the VMem case, if the key is within the range of LDS DMA IDs, + // then the corresponding index into the `LDSDMAStores` vector below is: + // Key - LDSDMA_BEGIN - 1 + // This is because LDSDMA_BEGIN is a generic entry and does not have an + // associated MachineInstr. + // + // TODO: Could we track SCC alongside SGPRs so it's not longer a special case? + + struct VMEMInfo { + // Scores for all instruction counters. Zero-initialized. + CounterValueArray Scores{}; + // Bitmask of the VmemTypes of VMEM instructions for this VGPR. + unsigned VMEMTypes = 0; + + bool empty() const { return all_of(Scores, equal_to(0)) && !VMEMTypes; } + }; + + struct SGPRInfo { + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps + // the X_CNT score. + std::array<unsigned, 2> Scores = {0}; + + bool empty() const { return !Scores[0] && !Scores[1]; } + }; + + DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA + DenseMap<MCRegUnit, SGPRInfo> SGPRs; + // Reg score for SCC. unsigned SCCScore = 0; // The unique instruction that has an SCC write pending, if there is one. const MachineInstr *PendingSCCWrite = nullptr; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is // alias info. One store is kept per unique AAInfo. - SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; + SmallVector<const MachineInstr *> LDSDMAStores; + + // State of all counters at each async mark encountered so far. 
+ SmallVector<CounterValueArray> AsyncMarks; + static constexpr unsigned MaxAsyncMarks = 16; + + // Track the upper bound score for async operations that are not part of a + // mark yet. Initialized to all zeros. + CounterValueArray AsyncScore{}; }; class SIInsertWaitcntsLegacy : public MachineFunctionPass { @@ -813,82 +1032,9 @@ public: } // end anonymous namespace -RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const { - if (Op.getReg() == AMDGPU::SCC) - return {SCC, SCC + 1}; - - const SIRegisterInfo *TRI = Context->TRI; - const MachineRegisterInfo *MRI = Context->MRI; - - if (!TRI->isInAllocatableClass(Op.getReg())) - return {-1, -1}; - - // A use via a PW operand does not need a waitcnt. - // A partial write is not a WAW. - assert(!Op.getSubReg() || !Op.isUndef()); - - RegInterval Result; - - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); - unsigned RegIdx = TRI->getHWRegIndex(MCReg); - - const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits - if (TRI->isVectorRegister(*MRI, Op.getReg())) { - unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ? 1 : 0); - assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET); - Result.first = Reg; - if (TRI->isAGPR(*MRI, Op.getReg())) - Result.first += AGPR_OFFSET; - assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - assert(Size % 16 == 0); - Result.second = Result.first + (Size / 16); - - if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) { - // Regardless of which lo16/hi16 is used, consider the full 32-bit - // register used. - if (AMDGPU::isHi16Reg(MCReg, *TRI)) - Result.first -= 1; - else - Result.second += 1; - } - } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { - // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar - // sources like SRC_PRIVATE_BASE. 
- Result.first = RegIdx + NUM_ALL_VGPRS; - Result.second = Result.first + divideCeil(Size, 32); - } else { - return {-1, -1}; - } - - return Result; -} - -void WaitcntBrackets::setScoreByInterval(RegInterval Interval, - InstCounterType CntTy, - unsigned Score) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (RegNo < NUM_ALL_VGPRS) { - VgprUB = std::max(VgprUB, RegNo); - VgprScores[CntTy][RegNo] = Score; - } else if (RegNo < NUM_ALL_ALLOCATABLE) { - SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); - SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; - } else { - assert(RegNo == SCC); - SCCScore = Score; - } - } -} - -void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, - const MachineOperand &Op, +void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, unsigned Score) { - RegInterval Interval = getRegInterval(MI, Op); - setScoreByInterval(Interval, CntTy, Score); + setRegScore(Op.getReg().asMCReg(), CntTy, Score); } // Return true if the subtarget is one that enables Point Sample Acceleration @@ -911,16 +1057,17 @@ bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { // one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER // (this is the type that a point sample accelerated instruction effectively // becomes) -bool WaitcntBrackets::hasPointSamplePendingVmemTypes( - const MachineInstr &MI, RegInterval Interval) const { +bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI, + MCPhysReg Reg) const { if (!hasPointSampleAccel(MI)) return false; - return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); + return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER); } void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); + InstCounterType T = Context->getCounterFromEvent(E); + assert(T < Context->MaxCounter); unsigned UB = 
getScoreUB(T); unsigned CurrScore = UB + 1; @@ -929,7 +1076,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // PendingEvents and ScoreUB need to be update regardless if this event // changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - PendingEvents |= 1 << E; + PendingEvents.insert(E); setScoreUB(T, CurrScore); const SIRegisterInfo *TRI = Context->TRI; @@ -943,57 +1090,52 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // All GDS operations must protect their address register (same as // export.) if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) - setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore); + setScoreByOperand(*AddrOp, EXP_CNT, CurrScore); if (Inst.mayStore()) { if (const auto *Data0 = TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) - setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore); + setScoreByOperand(*Data0, EXP_CNT, CurrScore); if (const auto *Data1 = TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) - setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore); + setScoreByOperand(*Data1, EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (TII->isFLAT(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, 
AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMTBUF(Inst)) { if (Inst.mayStore()) - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isLDSDIR(Inst)) { // LDSDIR instructions attach the score to the destination. - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), EXP_CNT, CurrScore); } else { if (TII->isEXP(Inst)) { @@ -1003,27 +1145,37 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // score. for (MachineOperand &DefMO : Inst.all_defs()) { if (TRI->isVGPR(*MRI, DefMO.getReg())) { - setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore); + setScoreByOperand(DefMO, EXP_CNT, CurrScore); } } } for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (T == X_CNT) { WaitEventType OtherEvent = E == SMEM_GROUP ? 
VMEM_GROUP : SMEM_GROUP; - if (PendingEvents & (1 << OtherEvent)) { + if (PendingEvents.contains(OtherEvent)) { // Hardware inserts an implicit xcnt between interleaved // SMEM and VMEM operations. So there will never be // outstanding address translations for both SMEM and // VMEM at the same time. setScoreLB(T, getScoreUB(T) - 1); - PendingEvents &= ~(1 << OtherEvent); + PendingEvents.remove(OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) - setScoreByOperand(&Inst, Op, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); + } else if (T == VA_VDST || T == VM_VSRC) { + // Match the score to the VGPR destination or source registers as + // appropriate + for (const MachineOperand &Op : Inst.operands()) { + if (!Op.isReg() || (T == VA_VDST && Op.isUse()) || + (T == VM_VSRC && Op.isDef())) + continue; + if (TRI->isVectorRegister(*Context->MRI, Op.getReg())) + setScoreByOperand(Op, T, CurrScore); + } } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. // @@ -1035,9 +1187,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // Special cases where implicit register defs exists, such as M0 or VCC, // but none with memory instructions. 
for (const MachineOperand &Op : Inst.defs()) { - RegInterval Interval = getRegInterval(&Inst, Op); if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { - if (Interval.first >= NUM_ALL_VGPRS) + if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper continue; if (updateVMCntOnly(Inst)) { // updateVMCntOnly should only leave us with VGPRs @@ -1050,16 +1201,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // this with another potential dependency if (hasPointSampleAccel(Inst)) TypesMask |= 1 << VMEM_NOSAMPLER; - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) - VgprVmemTypes[RegNo] |= TypesMask; + for (MCRegUnit RU : regunits(Op.getReg().asMCReg())) + VMem[toVMEMID(RU)].VMEMTypes |= TypesMask; } } - setScoreByInterval(Interval, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); } if (Inst.mayStore() && - (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { + (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) { // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS // written can be accessed. A load from LDS to VMEM does not need a wait. + // + // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then + // there is a MachineInstr in LDSDMAStores used to track this LDSDMA + // store. The "Slot" is the index into LDSDMAStores + 1. unsigned Slot = 0; for (const auto *MemOp : Inst.memoperands()) { if (!MemOp->isStore() || @@ -1072,9 +1227,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // original memory object and practically produced in the module LDS // lowering pass. If there is no scope available we will not be able // to disambiguate LDS aliasing as after the module lowering all LDS - // is squashed into a single big object. Do not attempt to use one of - // the limited LDSDMAStores for something we will not be able to use - // anyway. + // is squashed into a single big object. 
if (!AAI || !AAI.Scope) break; for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { @@ -1085,61 +1238,93 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } } - if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1) + if (Slot) break; + // The slot may not be valid because it can be >= NUM_LDSDMA which + // means the scoreboard cannot track it. We still want to preserve the + // MI in order to check alias information, though. LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); - if (Slot) - setRegScore(FIRST_LDS_VGPR, T, CurrScore); + setVMemScore(LDSDMA_BEGIN, T, CurrScore); + if (Slot && Slot < NUM_LDSDMA) + setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore); + } + + // FIXME: Not supported on GFX12 yet. Newer async operations use other + // counters too, so will need a map from instruction or event types to + // counter types. + if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) { + assert(!SIInstrInfo::usesASYNC_CNT(Inst)); + AsyncScore[T] = CurrScore; } if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { - setRegScore(SCC, T, CurrScore); + setRegScore(AMDGPU::SCC, T, CurrScore); PendingSCCWrite = &Inst; } } } +void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) { + // In the absence of loops, AsyncMarks can grow linearly with the program + // until we encounter an ASYNCMARK_WAIT. We could drop the oldest mark above a + // limit every time we push a new mark, but that seems like unnecessary work + // in practical cases. We do separately truncate the array when processing a + // loop, which should be sufficient. 
+ AsyncMarks.push_back(AsyncScore); + AsyncScore = {}; + LLVM_DEBUG({ + dbgs() << "recordAsyncMark:\n" << Inst; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); +} + void WaitcntBrackets::print(raw_ostream &OS) const { const GCNSubtarget *ST = Context->ST; - OS << '\n'; for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); - switch (T) { case LOAD_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case DS_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case EXP_CNT: - OS << " EXP_CNT(" << SR << "): "; + OS << " EXP_CNT(" << SR << "):"; break; case STORE_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT(" - << SR << "): "; + << SR << "):"; break; case SAMPLE_CNT: - OS << " SAMPLE_CNT(" << SR << "): "; + OS << " SAMPLE_CNT(" << SR << "):"; break; case BVH_CNT: - OS << " BVH_CNT(" << SR << "): "; + OS << " BVH_CNT(" << SR << "):"; break; case KM_CNT: - OS << " KM_CNT(" << SR << "): "; + OS << " KM_CNT(" << SR << "):"; break; case X_CNT: - OS << " X_CNT(" << SR << "): "; + OS << " X_CNT(" << SR << "):"; + break; + case VA_VDST: + OS << " VA_VDST(" << SR << "): "; + break; + case VM_VSRC: + OS << " VM_VSRC(" << SR << "): "; break; default: - OS << " UNKNOWN(" << SR << "): "; + OS << " UNKNOWN(" << SR << "):"; break; } @@ -1147,29 +1332,38 @@ void WaitcntBrackets::print(raw_ostream &OS) const { // Print vgpr scores. 
unsigned LB = getScoreLB(T); - for (int J = 0; J <= VgprUB; J++) { - unsigned RegScore = getRegScore(J, T); + SmallVector<VMEMID> SortedVMEMIDs(VMem.keys()); + sort(SortedVMEMIDs); + + for (auto ID : SortedVMEMIDs) { + unsigned RegScore = VMem.at(ID).Scores[T]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - if (J < FIRST_LDS_VGPR) { - OS << RelScore << ":v" << J << " "; + if (ID < REGUNITS_END) { + OS << ' ' << RelScore << ":vRU" << ID; } else { - OS << RelScore << ":ds "; + assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END && + "Unhandled/unexpected ID value!"); + OS << ' ' << RelScore << ":LDSDMA" << ID; } } + // Also need to print sgpr scores for lgkm_cnt or xcnt. if (isSmemCounter(T)) { - for (int J = 0; J <= SgprUB; J++) { - unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); + SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys()); + sort(SortedSMEMIDs); + for (auto ID : SortedSMEMIDs) { + unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - OS << RelScore << ":s" << J << " "; + OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID); } } + if (T == KM_CNT && SCCScore > 0) - OS << SCCScore << ":scc "; + OS << ' ' << SCCScore << ":scc"; } OS << '\n'; } @@ -1187,20 +1381,70 @@ void WaitcntBrackets::print(raw_ostream &OS) const { } OS << '\n'; + OS << "Async score: "; + if (AsyncScore.empty()) + OS << "none"; + else + llvm::interleaveComma(AsyncScore, OS); + OS << '\n'; + + OS << "Async marks: " << AsyncMarks.size() << '\n'; + + for (const auto &Mark : AsyncMarks) { + for (auto T : inst_counter_types()) { + unsigned MarkedScore = Mark[T]; + switch (T) { + case LOAD_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") + << "_CNT: " << MarkedScore; + break; + case DS_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? 
"DS" : "LGKM") + << "_CNT: " << MarkedScore; + break; + case EXP_CNT: + OS << " EXP_CNT: " << MarkedScore; + break; + case STORE_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") + << "_CNT: " << MarkedScore; + break; + case SAMPLE_CNT: + OS << " SAMPLE_CNT: " << MarkedScore; + break; + case BVH_CNT: + OS << " BVH_CNT: " << MarkedScore; + break; + case KM_CNT: + OS << " KM_CNT: " << MarkedScore; + break; + case X_CNT: + OS << " X_CNT: " << MarkedScore; + break; + default: + OS << " UNKNOWN: " << MarkedScore; + break; + } + } + OS << '\n'; + } OS << '\n'; } -/// Simplify the waitcnt, in the sense of removing redundant counts, and return -/// whether a waitcnt instruction is needed at all. -void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); - simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); - simplifyWaitcnt(DS_CNT, Wait.DsCnt); - simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); - simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); - simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); - simplifyWaitcnt(KM_CNT, Wait.KmCnt); - simplifyWaitcnt(X_CNT, Wait.XCnt); +/// Simplify \p UpdateWait by removing waits that are redundant based on the +/// current WaitcntBrackets and any other waits specified in \p CheckWait. 
+void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const { + simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt); + simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt); + simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt); + simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt); + simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt); + simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt); + simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt); + simplifyXcnt(CheckWait, UpdateWait); + simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst); + simplifyVmVsrc(CheckWait, UpdateWait); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -1212,52 +1456,155 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, Count = ~0u; } -void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const { + // Try to simplify xcnt further by checking for joint kmcnt and loadcnt + // optimizations. On entry to a block with multiple predescessors, there may + // be pending SMEM and VMEM events active at the same time. + // In such cases, only clear one active event at a time. + // TODO: Revisit xcnt optimizations for gfx1250. + // Wait on XCNT is redundant if we are already waiting for a load to complete. + // SMEM can return out of order, so only omit XCNT wait if we are waiting till + // zero. + if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) + UpdateWait.XCnt = ~0u; + // If we have pending store we cannot optimize XCnt because we do not wait for + // stores. VMEM loads retun in order, so if we only have loads XCnt is + // decremented to the same number as LOADCnt. 
+ if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt) + UpdateWait.XCnt = ~0u; + simplifyWaitcnt(X_CNT, UpdateWait.XCnt); +} + +void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait, + AMDGPU::Waitcnt &UpdateWait) const { + // Waiting for some counters implies waiting for VM_VSRC, since an + // instruction that decrements a counter on completion would have + // decremented VM_VSRC once its VGPR operands had been read. + if (CheckWait.VmVsrc >= + std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt, + CheckWait.BvhCnt, CheckWait.DsCnt})) + UpdateWait.VmVsrc = ~0u; + simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc); +} + +void WaitcntBrackets::purgeEmptyTrackingData() { + for (auto &[K, V] : make_early_inc_range(VMem)) { + if (V.empty()) + VMem.erase(K); + } + for (auto &[K, V] : make_early_inc_range(SGPRs)) { + if (V.empty()) + SGPRs.erase(K); + } +} + +void WaitcntBrackets::determineWaitForScore(InstCounterType T, + unsigned ScoreToWait, + AMDGPU::Waitcnt &Wait) const { const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - unsigned ScoreToWait = getRegScore(RegNo, T); - - // If the score of src_operand falls within the bracket, we need an - // s_waitcnt instruction. - if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !Context->ST->hasFlatLgkmVMemCountInOrder()) { - // If there is a pending FLAT operation, and this is a VMem or LGKM - // waitcnt and the target can report early completion, then we need - // to force a waitcnt 0. - addWait(Wait, T, 0); - } else if (counterOutOfOrder(T)) { - // Counter can get decremented out-of-order when there - // are multiple types event in the bracket. Also emit an s_wait counter - // with a conservative value of 0 for the counter. 
- addWait(Wait, T, 0); - } else { - // If a counter has been maxed out avoid overflow by waiting for - // MAX(CounterType) - 1 instead. - unsigned NeededWait = - std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); - addWait(Wait, T, NeededWait); - } + + // If the score falls within the bracket, we need a waitcnt. + if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { + if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && + !Context->ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. + addWait(Wait, T, 0); + } else if (counterOutOfOrder(T)) { + // Counter can get decremented out-of-order when there + // are multiple types event in the bracket. Also emit an s_wait counter + // with a conservative value of 0 for the counter. + addWait(Wait, T, 0); + } else { + // If a counter has been maxed out avoid overflow by waiting for + // MAX(CounterType) - 1 instead. + unsigned NeededWait = std::min( + UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1); + addWait(Wait, T, NeededWait); } } } +AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) { + LLVM_DEBUG({ + dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size() + << ":\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + AMDGPU::Waitcnt Wait; + if (AsyncMarks.size() == MaxAsyncMarks) { + // Enforcing MaxAsyncMarks here is unnecessary work because the size of + // MaxAsyncMarks is linear when traversing straightline code. But we do + // need to check if truncation may have occured at a merge, and adjust N + // to ensure that a wait is generated. + LLVM_DEBUG(dbgs() << "Possible truncation. 
Ensuring a non-trivial wait.\n"); + N = std::min(N, (unsigned)MaxAsyncMarks - 1); + } + + if (AsyncMarks.size() <= N) { + LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n"); + return Wait; + } + + size_t MarkIndex = AsyncMarks.size() - N - 1; + const auto &RequiredMark = AsyncMarks[MarkIndex]; + for (InstCounterType T : inst_counter_types()) + determineWaitForScore(T, RequiredMark[T], Wait); + + // Immediately remove the waited mark and all older ones + // This happens BEFORE the wait is actually inserted, which is fine + // because we've already extracted the wait requirements + LLVM_DEBUG({ + dbgs() << "Removing " << (MarkIndex + 1) + << " async marks after determining wait\n"; + }); + AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1); + + LLVM_DEBUG(dbgs() << "Waits to add: " << Wait); + return Wait; +} + +void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const { + if (Reg == AMDGPU::SCC) { + determineWaitForScore(T, SCCScore, Wait); + } else { + bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg); + for (MCRegUnit RU : regunits(Reg)) + determineWaitForScore( + T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T), + Wait); + } +} + +void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const { + assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END); + determineWaitForScore(T, getVMemScore(TID, T), Wait); +} + void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) { // S_BARRIER_WAIT on the same barrier guarantees that the pending write to // SCC has landed if (PendingSCCWrite && PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM && PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) { - unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE; + WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE); // If this SCC_WRITE is the only pending KM_CNT event, clear counter. 
- if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) == + if ((PendingEvents & Context->getWaitEvents(KM_CNT)) == SCC_WRITE_PendingEvent) { setScoreLB(KM_CNT, getScoreUB(KM_CNT)); } - PendingEvents &= ~SCC_WRITE_PendingEvent; + PendingEvents.remove(SCC_WRITE_PendingEvent); PendingSCCWrite = nullptr; } } @@ -1270,7 +1617,9 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); applyWaitcnt(BVH_CNT, Wait.BvhCnt); applyWaitcnt(KM_CNT, Wait.KmCnt); - applyXcnt(Wait); + applyWaitcnt(X_CNT, Wait.XCnt); + applyWaitcnt(VA_VDST, Wait.VaVdst); + applyWaitcnt(VM_VSRC, Wait.VmVsrc); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -1283,25 +1632,22 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~Context->WaitEventMaskForInst[T]; + PendingEvents.remove(Context->getWaitEvents(T)); } -} - -void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { - // Wait on XCNT is redundant if we are already waiting for a load to complete. - // SMEM can return out of order, so only omit XCNT wait if we are waiting till - // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); - - // If we have pending store we cannot optimize XCnt because we do not wait for - // stores. VMEM loads retun in order, so if we only have loads XCnt is - // decremented to the same number as LOADCnt. 
- if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); - applyWaitcnt(X_CNT, Wait.XCnt); + if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, 0); + else + PendingEvents.remove(SMEM_GROUP); + } + if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT)) { + if (!hasMixedPendingEvents(X_CNT)) + applyWaitcnt(X_CNT, Count); + else if (Count == 0) + PendingEvents.remove(VMEM_GROUP); + } } // Where there are multiple types of event in the bracket of a counter, @@ -1311,6 +1657,20 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; + + // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS), + // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause + // out-of-order completion. 
+ if (T == LOAD_CNT) { + unsigned Events = hasPendingEvent(T); + // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed + // events + Events &= ~(1 << GLOBAL_INV_ACCESS); + // Return true only if there are still multiple event types after removing + // GLOBAL_INV + return Events & (Events - 1); + } + return hasMixedPendingEvents(T); } @@ -1373,7 +1733,7 @@ bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { if (Opcode == Waitcnt->getOpcode()) return false; - Waitcnt->setDesc(TII->get(Opcode)); + Waitcnt->setDesc(TII.get(Opcode)); return true; } @@ -1385,7 +1745,6 @@ bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { - assert(ST); assert(isNormalMode(MaxCounter)); bool Modified = false; @@ -1394,7 +1753,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( LLVM_DEBUG({ dbgs() << "PreGFX12::applyPreexistingWaitcnt at: "; - if (It == OldWaitcntInstr.getParent()->instr_end()) + if (It.isEnd()) dbgs() << "end of block\n"; else dbgs() << *It; @@ -1427,11 +1786,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( } else WaitcntInstr = &II; } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { - assert(ST->hasVMemToLDSLoad()); + assert(ST.hasVMemToLDSLoad()); LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II - << "Before: " << Wait.LoadCnt << '\n';); - ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); - LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';); + << "Before: " << Wait << '\n';); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';); // It is possible (but unlikely) that this is the only wait instruction, // in which case, we exit this loop without a WaitcntInstr to consume @@ -1440,12 +1799,17 @@ bool 
WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( // possibility in an articial MIR test since such a situation cannot be // recreated by running the memory legalizer. II.eraseFromParent(); + } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) { + unsigned N = II.getOperand(0).getImm(); + LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';); + AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N); + Wait = Wait.combined(OldWait); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); unsigned OldVSCnt = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); if (TrySimplify) ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); @@ -1470,13 +1834,12 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( Wait.ExpCnt = ~0u; Wait.DsCnt = ~0u; - LLVM_DEBUG(It == WaitcntInstr->getParent()->end() - ? dbgs() - << "applied pre-existing waitcnt\n" - << "New Instr at block end: " << *WaitcntInstr << '\n' - : dbgs() << "applied pre-existing waitcnt\n" - << "Old Instr: " << *It - << "New Instr: " << *WaitcntInstr << '\n'); + LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n" + << "New Instr at block end: " + << *WaitcntInstr << '\n' + : dbgs() << "applied pre-existing waitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntInstr << '\n'); } if (WaitcntVsCntInstr) { @@ -1487,7 +1850,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); Wait.StoreCnt = ~0u; - LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() + LLVM_DEBUG(It.isEnd() ? 
dbgs() << "applied pre-existing waitcnt\n" << "New Instr at block end: " << *WaitcntVsCntInstr << '\n' @@ -1503,38 +1866,100 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { - assert(ST); + AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) { assert(isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. + // Emits waitcnts from (Outstanding-1) down to Target. + // The EmitWaitcnt callback emits a single waitcnt. + auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + do { + EmitWaitcnt(--Outstanding); + } while (Outstanding > Target); + Modified = true; + }; + // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. if (Wait.hasWaitExceptStoreCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + // If profiling expansion is enabled, emit an expanded sequence + if (ExpandWaitcntProfiling) { + // Check if any of the counters to be waited on are out-of-order. + // If so, fall back to normal (non-expanded) behavior since expansion + // would provide misleading profiling information. 
+ bool AnyOutOfOrder = false; + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned WaitCnt = Wait.get(CT); + if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) { + AnyOutOfOrder = true; + break; + } + } - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + if (AnyOutOfOrder) { + // Fall back to non-expanded wait + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + } else { + // All counters are in-order, safe to expand + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned WaitCnt = Wait.get(CT); + if (WaitCnt == ~0u) + continue; + + unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT), + getWaitCountMax(getLimits(), CT) - 1); + EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + W.set(CT, Count); + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } + } + } else { + // Normal behavior: emit single combined waitcnt + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } if (Wait.hasWaitStoreCnt()) { - assert(ST->hasVscnt()); - - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + assert(ST.hasVscnt()); + + if (ExpandWaitcntProfiling && Wait.StoreCnt != ~0u && + !ScoreBrackets.counterOutOfOrder(STORE_CNT)) { + // Only expand if counter is not out-of-order + unsigned Outstanding = + std::min(ScoreBrackets.getOutstanding(STORE_CNT), + getWaitCountMax(getLimits(), STORE_CNT) - 1); + EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned 
Count) { + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(Count); + }); + } else { + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } return Modified; @@ -1542,13 +1967,14 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( AMDGPU::Waitcnt WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { - return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST.hasVscnt() ? 0 : ~0u); } AMDGPU::Waitcnt WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { + unsigned ExpertVal = IsExpertMode ? 0 : ~0u; return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 
0 : ~0u, 0, 0, 0, - ~0u /* XCNT */); + ~0u /* XCNT */, ExpertVal, ExpertVal); } /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and @@ -1558,22 +1984,25 @@ WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { - assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; MachineInstr *CombinedLoadDsCntInstr = nullptr; MachineInstr *CombinedStoreDsCntInstr = nullptr; + MachineInstr *WaitcntDepctrInstr = nullptr; MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; LLVM_DEBUG({ dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: "; - if (It == OldWaitcntInstr.getParent()->instr_end()) + if (It.isEnd()) dbgs() << "end of block\n"; else dbgs() << *It; }); + // Accumulate waits that should not be simplified. + AMDGPU::Waitcnt RequiredWait; + for (auto &II : make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { LLVM_DEBUG(dbgs() << "pre-existing iter: " << II); @@ -1597,45 +2026,81 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { unsigned OldEnc = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); - Wait = Wait.combined(OldWait); + Wait = Wait.combined(OldWait); + else + RequiredWait = RequiredWait.combined(OldWait); UpdatableInstr = &CombinedLoadDsCntInstr; } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { unsigned OldEnc = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); if (TrySimplify) + Wait = Wait.combined(OldWait); + else + 
RequiredWait = RequiredWait.combined(OldWait); + UpdatableInstr = &CombinedStoreDsCntInstr; + } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned OldEnc = + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + AMDGPU::Waitcnt OldWait; + OldWait.VaVdst = AMDGPU::DepCtr::decodeFieldVaVdst(OldEnc); + OldWait.VmVsrc = AMDGPU::DepCtr::decodeFieldVmVsrc(OldEnc); + if (TrySimplify) ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); - UpdatableInstr = &CombinedStoreDsCntInstr; + UpdatableInstr = &WaitcntDepctrInstr; } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { // Architectures higher than GFX10 do not have direct loads to // LDS, so no work required here yet. II.eraseFromParent(); continue; + } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) { + reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet"); } else { std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); assert(CT.has_value()); unsigned OldCnt = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); - addWait(Wait, CT.value(), OldCnt); + addWait(Wait, CT.value(), OldCnt); + else + addWait(RequiredWait, CT.value(), OldCnt); UpdatableInstr = &WaitInstrs[CT.value()]; } // Merge consecutive waitcnt of the same type by erasing multiples. if (!*UpdatableInstr) { *UpdatableInstr = &II; + } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) { + // S_WAITCNT_DEPCTR requires special care. Don't remove a + // duplicate if it is waiting on things other than VA_VDST or + // VM_VSRC. If that is the case, just make sure the VA_VDST and + // VM_VSRC subfields of the operand are set to the "no wait" + // values. 
+ + unsigned Enc = TII.getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, ~0u); + Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, ~0u); + + if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) { + Modified |= updateOperandIfDifferent(II, AMDGPU::OpName::simm16, Enc); + Modified |= promoteSoftWaitCnt(&II); + } else { + II.eraseFromParent(); + Modified = true; + } } else { II.eraseFromParent(); Modified = true; } } + ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait); + Wait = Wait.combined(RequiredWait); + if (CombinedLoadDsCntInstr) { // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need // to be waited for. Otherwise, let the instruction be deleted so @@ -1644,6 +2109,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( // createNewWaitcnt(). As a side effect, resetting the wait counts will // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by // the loop below that deals with single counter instructions. + // + // A wait for LOAD_CNT or DS_CNT implies a wait for VM_VSRC, since + // instructions that have decremented LOAD_CNT or DS_CNT on completion + // will have needed to wait for their register sources to be available + // first. if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr, @@ -1654,13 +2124,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( Wait.LoadCnt = ~0u; Wait.DsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() - ? dbgs() << "applied pre-existing waitcnt\n" - << "New Instr at block end: " - << *CombinedLoadDsCntInstr << '\n' - : dbgs() << "applied pre-existing waitcnt\n" - << "Old Instr: " << *It << "New Instr: " - << *CombinedLoadDsCntInstr << '\n'); + LLVM_DEBUG(It.isEnd() ? 
dbgs() << "applied pre-existing waitcnt\n" + << "New Instr at block end: " + << *CombinedLoadDsCntInstr << '\n' + : dbgs() << "applied pre-existing waitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedLoadDsCntInstr << '\n'); } else { CombinedLoadDsCntInstr->eraseFromParent(); Modified = true; @@ -1679,13 +2148,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( Wait.StoreCnt = ~0u; Wait.DsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() - ? dbgs() << "applied pre-existing waitcnt\n" - << "New Instr at block end: " - << *CombinedStoreDsCntInstr << '\n' - : dbgs() << "applied pre-existing waitcnt\n" - << "Old Instr: " << *It << "New Instr: " - << *CombinedStoreDsCntInstr << '\n'); + LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n" + << "New Instr at block end: " + << *CombinedStoreDsCntInstr << '\n' + : dbgs() << "applied pre-existing waitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedStoreDsCntInstr << '\n'); } else { CombinedStoreDsCntInstr->eraseFromParent(); Modified = true; @@ -1729,7 +2197,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( if (!WaitInstrs[CT]) continue; - unsigned NewCnt = getWait(Wait, CT); + unsigned NewCnt = Wait.get(CT); if (NewCnt != ~0u) { Modified |= updateOperandIfDifferent(*WaitInstrs[CT], AMDGPU::OpName::simm16, NewCnt); @@ -1738,7 +2206,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( ScoreBrackets.applyWaitcnt(CT, NewCnt); setNoWait(Wait, CT); - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n" << "New Instr at block end: " << *WaitInstrs[CT] << '\n' @@ -1751,19 +2219,86 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } } + if (WaitcntDepctrInstr) { + // Get the encoded Depctr immediate and override the VA_VDST and VM_VSRC + // subfields with the new required values. 
+ unsigned Enc = + TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16) + ->getImm(); + Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, Wait.VmVsrc); + Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst); + + ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst); + ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc); + Wait.VaVdst = ~0u; + Wait.VmVsrc = ~0u; + + // If that new encoded Depctr immediate would actually still wait + // for anything, update the instruction's operand. Otherwise it can + // just be deleted. + if (Enc != (unsigned)AMDGPU::DepCtr::getDefaultDepCtrEncoding(ST)) { + Modified |= updateOperandIfDifferent(*WaitcntDepctrInstr, + AMDGPU::OpName::simm16, Enc); + LLVM_DEBUG(It.isEnd() ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *WaitcntDepctrInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *WaitcntDepctrInstr << '\n'); + } else { + WaitcntDepctrInstr->eraseFromParent(); + Modified = true; + } + } + return Modified; } /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { - assert(ST); + AMDGPU::Waitcnt Wait, const WaitcntBrackets &ScoreBrackets) { assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. 
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + for (unsigned I = Outstanding - 1; I > Target && I != ~0u; --I) + EmitWaitcnt(I); + EmitWaitcnt(Target); + Modified = true; + }; + + // For GFX12+, we use separate wait instructions, which makes expansion + // simpler + if (ExpandWaitcntProfiling) { + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = Wait.get(CT); + if (Count == ~0u) + continue; + + // Skip expansion for out-of-order counters - emit normal wait instead + if (ScoreBrackets.counterOutOfOrder(CT)) { + BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT])) + .addImm(Count); + Modified = true; + continue; + } + + unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT), + getWaitCountMax(getLimits(), CT) - 1); + EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) { + BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT])) + .addImm(Val); + }); + } + return Modified; + } + + // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. 
if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -1771,7 +2306,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( if (Wait.LoadCnt != ~0u) { unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); - SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) + SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) .addImm(Enc); Wait.LoadCnt = ~0u; @@ -1779,9 +2314,8 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( } else if (Wait.StoreCnt != ~0u) { unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); - SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) - .addImm(Enc); + SWaitInst = BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAIT_STORECNT_DSCNT)) + .addImm(Enc); Wait.StoreCnt = ~0u; Wait.DsCnt = ~0u; @@ -1790,7 +2324,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( if (SWaitInst) { Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); } @@ -1800,16 +2334,31 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( // waiting for. 
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { - unsigned Count = getWait(Wait, CT); + unsigned Count = Wait.get(CT); if (Count == ~0u) continue; [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + BuildMI(Block, It, DL, TII.get(instrsForExtendedCounterTypes[CT])) .addImm(Count); Modified = true; + LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + if (Wait.hasWaitDepctr()) { + assert(IsExpertMode); + unsigned Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Wait.VmVsrc, ST); + Enc = AMDGPU::DepCtr::encodeFieldVaVdst(Enc, Wait.VaVdst); + + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_DEPCTR)).addImm(Enc); + + Modified = true; + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; dbgs() << "New Instr: " << *SWaitInst << '\n'); @@ -1818,19 +2367,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -/// \returns true if the callee inserts an s_waitcnt 0 on function entry. -static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { - // Currently all conventions wait, but this may not always be the case. - // - // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make - // senses to omit the wait and do it in the caller. - return true; -} - -/// \returns true if the callee is expected to wait for any outstanding waits -/// before returning. -static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; } - /// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -1841,12 +2377,13 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; } /// and if so what the value of each counter is. 
/// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to -/// flush the vmcnt counter here. -bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, - WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr, - bool FlushVmCnt) { +/// If FlushFlags.FlushVmCnt is true, we want to flush the vmcnt counter here. +/// If FlushFlags.FlushDsCnt is true, we want to flush the dscnt counter here +/// (GFX12+ only, where DS_CNT is a separate counter). +bool SIInsertWaitcnts::generateWaitcntInstBefore( + MachineInstr &MI, WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) { + LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs());); setForceEmitWaitcnt(); assert(!MI.isMetaInstruction()); @@ -1854,54 +2391,70 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, AMDGPU::Waitcnt Wait; const unsigned Opc = MI.getOpcode(); - // FIXME: This should have already been handled by the memory legalizer. - // Removing this currently doesn't affect any lit tests, but we need to - // verify that nothing was relying on this. The number of buffer invalidates - // being handled here should not be expanded. - if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || - Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || - Opc == AMDGPU::BUFFER_GL1_INV) { - Wait.LoadCnt = 0; - } - - // All waits must be resolved at call return. - // NOTE: this could be improved with knowledge of all call sites or - // with knowledge of the called routines. 
- if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || - Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - Opc == AMDGPU::S_SETPC_B64_return || - (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); - } - // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. - // Technically the hardware will do this on its own if we don't, but that - // might cost extra cycles compared to doing it explicitly. - // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may - // have to wait for outstanding VMEM stores. In this case it can be useful to - // send a message to explicitly release all VGPRs before the stores have - // completed, but it is only safe to do this if there are no outstanding - // scratch stores. - else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { - if (!WCG->isOptNone() && - (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() || - (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && - ScoreBrackets.getScoreRange(STORE_CNT) != 0 && - !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)))) - ReleaseVGPRInsts.insert(&MI); - } - // Resolve vm waits before gs-done. - else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && - ST->hasLegacyGeometry() && - ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == - AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { + switch (Opc) { + case AMDGPU::BUFFER_WBINVL1: + case AMDGPU::BUFFER_WBINVL1_SC: + case AMDGPU::BUFFER_WBINVL1_VOL: + case AMDGPU::BUFFER_GL0_INV: + case AMDGPU::BUFFER_GL1_INV: { + // FIXME: This should have already been handled by the memory legalizer. + // Removing this currently doesn't affect any lit tests, but we need to + // verify that nothing was relying on this. The number of buffer invalidates + // being handled here should not be expanded. 
Wait.LoadCnt = 0; + break; + } + case AMDGPU::SI_RETURN_TO_EPILOG: + case AMDGPU::SI_RETURN: + case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: + case AMDGPU::S_SETPC_B64_return: { + // All waits must be resolved at call return. + // NOTE: this could be improved with knowledge of all call sites or + // with knowledge of the called routines. + ReturnInsts.insert(&MI); + AMDGPU::Waitcnt AllZeroWait = + WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); + // On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads + // (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt. + // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's + // no need to wait for it at function boundaries. + if (ST->hasExtendedWaitCounts() && + !ScoreBrackets.hasPendingEvent(VMEM_ACCESS)) + AllZeroWait.LoadCnt = ~0u; + Wait = AllZeroWait; + break; + } + case AMDGPU::S_ENDPGM: + case AMDGPU::S_ENDPGM_SAVED: { + // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. + // Technically the hardware will do this on its own if we don't, but that + // might cost extra cycles compared to doing it explicitly. + // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may + // have to wait for outstanding VMEM stores. In this case it can be useful + // to send a message to explicitly release all VGPRs before the stores have + // completed, but it is only safe to do this if there are no outstanding + // scratch stores. + EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) && + !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS); + break; + } + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: { + if (ST->hasLegacyGeometry() && + ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == + AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { + // Resolve vm waits before gs-done. 
+ Wait.LoadCnt = 0; + break; + } + [[fallthrough]]; } + default: { - // Export & GDS instructions do not read the EXEC mask until after the export - // is granted (which can occur well after the instruction is issued). - // The shader program must flush all EXP operations on the export-count - // before overwriting the EXEC mask. - else { + // Export & GDS instructions do not read the EXEC mask until after the + // export is granted (which can occur well after the instruction is issued). + // The shader program must flush all EXP operations on the export-count + // before overwriting the EXEC mask. if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. @@ -1918,27 +2471,22 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); - if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { + if (MI.isCall()) { // The function is going to insert a wait on everything in its prolog. // This still needs to be careful if the call target is a load (e.g. a GOT // load). We also need to check WAW dependency with saved PC. 
+ CallInsts.insert(&MI); Wait = AMDGPU::Waitcnt(); - const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI); if (CallAddrOp.isReg()) { - RegInterval CallAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, CallAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait); if (const auto *RtnAddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { - RegInterval RtnAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, *RtnAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait); } } } else if (Opc == AMDGPU::S_BARRIER_WAIT) { @@ -1975,18 +2523,26 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, continue; // LOAD_CNT is only relevant to vgpr or LDS. - unsigned RegNo = FIRST_LDS_VGPR; + unsigned TID = LDSDMA_BEGIN; if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { - if (MI.mayAlias(AA, *LDSDMAStores[I], true)) - ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { + if ((I + 1) >= NUM_LDSDMA) { + // We didn't have enough slot to track this LDS DMA store, it + // has been tracked using the common RegNo (FIRST_LDS_VGPR). 
+ ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); + break; + } + + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait); + } } } else { - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); } if (Memop->isStore()) { - ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait); } } @@ -1999,7 +2555,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) continue; - RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op); + MCPhysReg Reg = Op.getReg().asMCReg(); const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); if (IsVGPR) { @@ -2011,6 +2567,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isImplicit() && MI.mayLoadOrStore()) continue; + ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait); + if (Op.isDef()) + ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait); // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the // previous write and this write are the same type of VMEM // instruction, in which case they are (in some architectures) @@ -2018,31 +2577,31 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Additionally check instructions where Point Sample Acceleration // might be applied. 
if (Op.isUse() || !updateVMCntOnly(MI) || - ScoreBrackets.hasOtherPendingVmemTypes(Interval, - getVmemType(MI)) || - ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) || + ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) || + ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) || !ST->hasVmemWriteVgprInOrder()) { - ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait); - ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait); - ScoreBrackets.determineWait(BVH_CNT, Interval, Wait); - ScoreBrackets.clearVgprVmemTypes(Interval); + ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait); + ScoreBrackets.clearVgprVmemTypes(Reg); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { - ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait); } - ScoreBrackets.determineWait(DS_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait); } else if (Op.getReg() == AMDGPU::SCC) { - ScoreBrackets.determineWait(KM_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait); } else { - ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait); } - if (ST->hasWaitXCnt() && Op.isDef()) - ScoreBrackets.determineWait(X_CNT, Interval, Wait); + if (ST->hasWaitXcnt() && Op.isDef()) + ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait); } } } + } // Ensure safety against exceptions from outstanding memory operations while // waiting for a barrier: @@ -2057,7 +2616,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. 
if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && - !ST->supportsBackOffBarrier()) { + !ST->hasBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } @@ -2072,35 +2631,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Verify that the wait is actually needed. ScoreBrackets.simplifyWaitcnt(Wait); + // It is only necessary to insert an S_WAITCNT_DEPCTR instruction that + // waits on VA_VDST if the instruction it would precede is not a VALU + // instruction, since hardware handles VALU->VGPR->VALU hazards in + // expert scheduling mode. + if (TII->isVALU(MI)) + Wait.VaVdst = ~0u; + + // Since the translation for VMEM addresses occur in-order, we can apply the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (Wait.XCnt != ~0u && isVmemAccess(MI)) { + ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt); + Wait.XCnt = ~0u; + } + // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. if (ForceEmitZeroFlag && !MI.isTerminator()) Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); - if (ForceEmitWaitcnt[LOAD_CNT]) - Wait.LoadCnt = 0; - if (ForceEmitWaitcnt[EXP_CNT]) - Wait.ExpCnt = 0; - if (ForceEmitWaitcnt[DS_CNT]) - Wait.DsCnt = 0; - if (ForceEmitWaitcnt[SAMPLE_CNT]) - Wait.SampleCnt = 0; - if (ForceEmitWaitcnt[BVH_CNT]) - Wait.BvhCnt = 0; - if (ForceEmitWaitcnt[KM_CNT]) - Wait.KmCnt = 0; - if (ForceEmitWaitcnt[X_CNT]) - Wait.XCnt = 0; - - if (FlushVmCnt) { - if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) - Wait.LoadCnt = 0; - if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) - Wait.SampleCnt = 0; - if (ScoreBrackets.hasPendingEvent(BVH_CNT)) - Wait.BvhCnt = 0; + // If we force waitcnt then update Wait accordingly. 
+ for (InstCounterType T : inst_counter_types()) { + if (!ForceEmitWaitcnt[T]) + continue; + Wait.set(T, 0); + } + + if (FlushFlags.FlushVmCnt) { + for (InstCounterType T : {LOAD_CNT, SAMPLE_CNT, BVH_CNT}) + Wait.set(T, 0); } + if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT)) + Wait.DsCnt = 0; + if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u) Wait.LoadCnt = 0; @@ -2121,10 +2686,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Modified = WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - // Any counts that could have been applied to any existing waitcnt - // instructions will have been done so, now deal with any remaining. - ScoreBrackets.applyWaitcnt(Wait); - // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && SIInstrInfo::isVINTERP(*It)) { @@ -2134,31 +2695,59 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, WaitExp->setImm(Wait.ExpCnt); Modified = true; } + // Apply ExpCnt before resetting it, so applyWaitcnt below sees all counts. + ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); Wait.ExpCnt = ~0u; LLVM_DEBUG(dbgs() << "generateWaitcnt\n" << "Update Instr: " << *It); } - // XCnt may be already consumed by a load wait. - if (Wait.XCnt != ~0u) { - if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (WCG->createNewWaitcnt(Block, It, Wait, ScoreBrackets)) + Modified = true; - if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(Wait); - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory - // dependency with another VMEM instruction in flight. 
- if (isVmemAccess(*It)) - Wait.XCnt = ~0u; + return Modified; +} + +std::optional<WaitEventType> +SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const { + if (TII->isVALU(Inst)) { + // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete + // out-of-order with respect to each other, so each of these classes + // has its own event. + + if (TII->isXDL(Inst)) + return VGPR_XDL_WRITE; + + if (TII->isTRANS(Inst)) + return VGPR_TRANS_WRITE; + + if (AMDGPU::isDPMACCInstruction(Inst.getOpcode())) + return VGPR_DPMACC_WRITE; + + return VGPR_CSMACC_WRITE; } - if (WCG->createNewWaitcnt(Block, It, Wait)) - Modified = true; + // FLAT and LDS instructions may read their VGPR sources out-of-order + // with respect to each other and all other VMEM instructions, so + // each of these also has a separate event. - return Modified; + if (TII->isFLAT(Inst)) + return VGPR_FLAT_READ; + + if (TII->isDS(Inst)) + return VGPR_LDS_READ; + + if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst)) + return VGPR_VMEM_READ; + + // Otherwise, no hazard. + + return {}; } bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { @@ -2235,6 +2824,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, bool IsVMEMAccess = false; bool IsSMEMAccess = false; + + if (IsExpertMode) { + if (const auto ET = getExpertSchedulingEventType(Inst)) + ScoreBrackets->updateByEvent(*ET, Inst); + } + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { @@ -2265,13 +2860,19 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } - // This is a flat memory operation that access both VMEM and LDS, so note it - // - it will require that both the VM and LGKM be flushed to zero if it is - // pending when a VM or LGKM dependency occurs. 
- if (FlatASCount > 1) + // Async/LDSDMA operations have FLAT encoding but do not actually use flat + // pointers. They do have two operands that each access global and LDS, thus + // making it appear at this point that they are using a flat pointer. Filter + // them out, and for the rest, generate a dependency on flat pointers so + // that both VM and LGKM counters are flushed. + if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && - !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { + (!AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode()) || + Inst.getOpcode() == AMDGPU::BUFFER_WBL2)) { + // BUFFER_WBL2 is included here because unlike invalidates, has to be + // followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has + // completed. IsVMEMAccess = true; ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); @@ -2283,15 +2884,9 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, IsSMEMAccess = true; ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst); } else if (Inst.isCall()) { - if (callWaitsOnFunctionReturn(Inst)) { - // Act as a wait on everything - ScoreBrackets->applyWaitcnt( - WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); - ScoreBrackets->setStateOnFunctionEntryOrReturn(); - } else { - // May need to way wait for anything. 
- ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); - } + // Act as a wait on everything + ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); + ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else if (SIInstrInfo::isLDSDIR(Inst)) { ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst); } else if (TII->isVINTERP(Inst)) { @@ -2324,7 +2919,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } - if (!ST->hasWaitXCnt()) + if (!ST->hasWaitXcnt()) return; if (IsVMEMAccess) @@ -2343,6 +2938,84 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, return OtherShifted > MyShifted; } +bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos, + ArrayRef<CounterValueArray> OtherMarks) { + bool StrictDom = false; + + LLVM_DEBUG(dbgs() << "Merging async marks ..."); + // Early exit: both empty + if (AsyncMarks.empty() && OtherMarks.empty()) { + LLVM_DEBUG(dbgs() << " nothing to merge\n"); + return false; + } + LLVM_DEBUG(dbgs() << '\n'); + + // Determine maximum length needed after merging + auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size()); + + // For each backedge in isolation, the algorithm reachs a fixed point after + // the first call to merge(). This is unchanged even with the AsyncMarks + // array because we call mergeScore just like the other cases. + // + // But in the rare pathological case, a nest of loops that pushes marks + // without waiting on any mark can cause AsyncMarks to grow very large. We cap + // it to a reasonable limit. We can tune this later or potentially introduce a + // user option to control the value. + MaxSize = std::min(MaxSize, MaxAsyncMarks); + + // Keep only the most recent marks within our limit. + if (AsyncMarks.size() > MaxSize) + AsyncMarks.erase(AsyncMarks.begin(), + AsyncMarks.begin() + (AsyncMarks.size() - MaxSize)); + + // Pad with zero-filled marks if our list is shorter. 
Zero represents "no + // pending async operations at this checkpoint" and acts as the identity + // element for max() during merging. We pad at the beginning since the marks + // need to be aligned in most-recent order. + CounterValueArray ZeroMark{}; + AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark); + + LLVM_DEBUG({ + dbgs() << "Before merge:\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + LLVM_DEBUG({ + dbgs() << "Other marks:\n"; + for (const auto &Mark : OtherMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + // Merge element-wise using the existing mergeScore function and the + // appropriate MergeInfo for each counter type. Iterate only while we have + // elements in both vectors. + unsigned OtherSize = OtherMarks.size(); + unsigned OurSize = AsyncMarks.size(); + unsigned MergeCount = std::min(OtherSize, OurSize); + assert(OurSize == MaxSize); + for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) { + for (auto T : inst_counter_types(Context->MaxCounter)) { + StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T], + OtherMarks[OtherSize - Idx][T]); + } + } + + LLVM_DEBUG({ + dbgs() << "After merge:\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + return StrictDom; +} + /// Merge the pending events and associater score brackets of \p Other into /// this brackets status. /// @@ -2351,15 +3024,22 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { bool StrictDom = false; - VgprUB = std::max(VgprUB, Other.VgprUB); - SgprUB = std::max(SgprUB, Other.SgprUB); + // Check if "other" has keys we don't have, and create default entries for + // those. If they remain empty after merging, we will clean it up after. 
+ for (auto K : Other.VMem.keys()) + VMem.try_emplace(K); + for (auto K : Other.SGPRs.keys()) + SGPRs.try_emplace(K); + + // Array to store MergeInfo for each counter type + MergeInfo MergeInfos[NUM_INST_CNTS]; for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter - const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; - const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; - const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; - if (OtherEvents & ~OldEvents) + const WaitEventSet &EventsForT = Context->getWaitEvents(T); + const WaitEventSet OldEvents = PendingEvents & EventsForT; + const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT; + if (!OldEvents.contains(OtherEvents)) StrictDom = true; PendingEvents |= OtherEvents; @@ -2370,7 +3050,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { if (NewUB < ScoreLBs[T]) report_fatal_error("waitcnt score overflow"); - MergeInfo M; + MergeInfo &M = MergeInfos[T]; M.OldLB = ScoreLBs[T]; M.OtherLB = Other.ScoreLBs[T]; M.MyShift = NewUB - ScoreUBs[T]; @@ -2386,8 +3066,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { if (T == KM_CNT) { StrictDom |= mergeScore(M, SCCScore, Other.SCCScore); if (Other.hasPendingEvent(SCC_WRITE)) { - unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); - if (!OldEventsHasSCCWrite) { + if (!OldEvents.contains(SCC_WRITE)) { PendingSCCWrite = Other.PendingSCCWrite; } else if (PendingSCCWrite != Other.PendingSCCWrite) { PendingSCCWrite = nullptr; @@ -2395,23 +3074,33 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } - for (int J = 0; J <= VgprUB; J++) - StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); + for (auto &[RegID, Info] : VMem) + StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T)); if (isSmemCounter(T)) { unsigned Idx = getSgprScoresIdx(T); - for (int J = 0; J <= SgprUB; J++) - StrictDom |= - 
mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]); + for (auto &[RegID, Info] : SGPRs) { + auto It = Other.SGPRs.find(RegID); + unsigned OtherScore = + (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0; + StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore); + } } } - for (int J = 0; J <= VgprUB; J++) { - unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; - StrictDom |= NewVmemTypes != VgprVmemTypes[J]; - VgprVmemTypes[J] = NewVmemTypes; + for (auto &[TID, Info] : VMem) { + if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) { + unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes; + StrictDom |= NewVmemTypes != Info.VMEMTypes; + Info.VMEMTypes = NewVmemTypes; + } } + StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks); + for (auto T : inst_counter_types(Context->MaxCounter)) + StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]); + + purgeEmptyTrackingData(); return StrictDom; } @@ -2423,9 +3112,53 @@ static bool isWaitInstr(MachineInstr &Inst) { Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || Opcode == AMDGPU::S_WAITCNT_lds_direct || + Opcode == AMDGPU::WAIT_ASYNCMARK || counterTypeForInstr(Opcode).has_value(); } +void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + bool ExpertMode) const { + const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_SCHED_MODE, AMDGPU::Hwreg::HwregOffset::Default, 2); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(ExpertMode ? 2 : 0) + .addImm(EncodedReg); +} + +// Track back-to-back atomic RMW instructions, referred to as a block. +// +// Determines whether \p MI starts a new atomic RMW block, is inside +// an existing block, or is outside of a block. A block is broken when a +// CU-scoped memory op or an atomic store is encountered. ALU ops +// and non-memory instructions don't break a block. 
The function returns +// the new state after processing the current instruction based on +// \p PrevState, the previously captured state. +AtomicRMWState +SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI, + AtomicRMWState PrevState) const { + if (isAtomicRMW(MI)) { + // Transition from NotInBlock -> NewBlock -> InsideBlock. + if (PrevState == AtomicRMWState::NotInBlock) + return AtomicRMWState::NewBlock; + if (PrevState == AtomicRMWState::NewBlock) + return AtomicRMWState::InsideBlock; + + return PrevState; + } + + // LDS memory operations don't break the block. + if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI))) + return PrevState; + + // Reset the atomic RMW block state when found other VMEM and SMEM operations. + if (MI.mayLoad() ^ MI.mayStore()) + return AtomicRMWState::NotInBlock; + + // Return the previous state otherwise. + return PrevState; +} + // Generate s_waitcnt instructions where needed. bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2454,6 +3187,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Walk over the instructions. MachineInstr *OldWaitcntInstr = nullptr; + AtomicRMWState RMWState = AtomicRMWState::NotInBlock; for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), E = Block.instr_end(); @@ -2463,22 +3197,50 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; continue; } + // Get the atomic RMW block state for current instruction. + RMWState = getAtomicRMWState(Inst, RMWState); // Track pre-existing waitcnts that were added in earlier iterations or by // the memory legalizer. - if (isWaitInstr(Inst)) { - if (!OldWaitcntInstr) - OldWaitcntInstr = &Inst; + if (isWaitInstr(Inst) || + (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) { ++Iter; + bool IsSoftXcnt = isSoftXcnt(Inst); + // The Memory Legalizer conservatively inserts a soft xcnt before each + // atomic RMW operation. 
However, for sequences of back-to-back atomic + // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away + // the redundant soft xcnts when we're inside an atomic RMW block. + if (Iter != E && IsSoftXcnt) { + // Check if the next instruction can potentially change the atomic RMW + // state. + RMWState = getAtomicRMWState(*Iter, RMWState); + } + + if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) { + // Delete this soft xcnt. + Inst.eraseFromParent(); + Modified = true; + } else if (!OldWaitcntInstr) { + OldWaitcntInstr = &Inst; + } continue; } - bool FlushVmCnt = Block.getFirstTerminator() == Inst && - isPreheaderToFlush(Block, ScoreBrackets); + PreheaderFlushFlags FlushFlags; + if (Block.getFirstTerminator() == Inst) + FlushFlags = isPreheaderToFlush(Block, ScoreBrackets); + + if (Inst.getOpcode() == AMDGPU::ASYNCMARK) { + // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. + assert(ST->getGeneration() < AMDGPUSubtarget::GFX12); + ScoreBrackets.recordAsyncMark(Inst); + ++Iter; + continue; + } // Generate an s_waitcnt instruction to be placed before Inst, if needed. Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, - FlushVmCnt); + FlushFlags); OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. @@ -2552,17 +3314,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; } - // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if - // needed. + // Flush counters at the end of the block if needed (for preheaders with no + // terminator). 
AMDGPU::Waitcnt Wait; - if (Block.getFirstTerminator() == Block.end() && - isPreheaderToFlush(Block, ScoreBrackets)) { - if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) - Wait.LoadCnt = 0; - if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) - Wait.SampleCnt = 0; - if (ScoreBrackets.hasPendingEvent(BVH_CNT)) - Wait.BvhCnt = 0; + if (Block.getFirstTerminator() == Block.end()) { + PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets); + if (FlushFlags.FlushVmCnt) { + if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) + Wait.LoadCnt = 0; + if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) + Wait.SampleCnt = 0; + if (ScoreBrackets.hasPendingEvent(BVH_CNT)) + Wait.BvhCnt = 0; + } + if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT)) + Wait.DsCnt = 0; } // Combine or remove any redundant waitcnts at the end of the block. @@ -2578,29 +3344,29 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, return Modified; } -// Return true if the given machine basic block is a preheader of a loop in -// which we want to flush the vmcnt counter, and false otherwise. -bool SIInsertWaitcnts::isPreheaderToFlush( - MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) { - auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false); +// Return flags indicating which counters should be flushed in the preheader. 
+PreheaderFlushFlags +SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, + const WaitcntBrackets &ScoreBrackets) { + auto [Iterator, IsInserted] = + PreheadersToFlush.try_emplace(&MBB, PreheaderFlushFlags()); if (!IsInserted) return Iterator->second; MachineBasicBlock *Succ = MBB.getSingleSuccessor(); if (!Succ) - return false; + return PreheaderFlushFlags(); MachineLoop *Loop = MLI->getLoopFor(Succ); if (!Loop) - return false; + return PreheaderFlushFlags(); - if (Loop->getLoopPreheader() == &MBB && - shouldFlushVmCnt(Loop, ScoreBrackets)) { - Iterator->second = true; - return true; + if (Loop->getLoopPreheader() == &MBB) { + Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets); + return Iterator->second; } - return false; + return PreheaderFlushFlags(); } bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { @@ -2609,72 +3375,152 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { return SIInstrInfo::isVMEM(MI); } -// Return true if it is better to flush the vmcnt counter in the preheader of -// the given loop. We currently decide to flush in two situations: +bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const { + return SIInstrInfo::isDS(MI) && MI.mayLoad() && !MI.mayStore(); +} + +// Check if instruction is a store to LDS that is counted via DSCNT +// (where that counter exists). +bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const { + if (!MI.mayStore()) + return false; + if (SIInstrInfo::isDS(MI)) + return true; + return false; +} + +// Return flags indicating which counters should be flushed in the preheader of +// the given loop. We currently decide to flush in a few situations: +// For VMEM (FlushVmCnt): // 1. The loop contains vmem store(s), no vmem load and at least one use of a // vgpr containing a value that is loaded outside of the loop. (Only on // targets with no vscnt counter). // 2. 
The loop contains vmem load(s), but the loaded values are not used in the // loop, and at least one use of a vgpr containing a value that is loaded // outside of the loop. -bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, - const WaitcntBrackets &Brackets) { +// For DS (FlushDsCnt, GFX12+ only): +// 3. The loop contains no DS reads, and at least one use of a vgpr containing +// a value that is DS loaded outside of the loop. +// 4. The loop contains DS read(s), loaded values are not used in the same +// iteration but in the next iteration (prefetch pattern), and at least one +// use of a vgpr containing a value that is DS loaded outside of the loop. +// Flushing in preheader reduces wait overhead if the wait requirement in +// iteration 1 would otherwise be more strict. +PreheaderFlushFlags +SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, + const WaitcntBrackets &Brackets) { + PreheaderFlushFlags Flags; bool HasVMemLoad = false; bool HasVMemStore = false; - bool UsesVgprLoadedOutside = false; - DenseSet<Register> VgprUse; - DenseSet<Register> VgprDef; + bool SeenDSStoreInLoop = false; + bool UsesVgprLoadedOutsideVMEM = false; + bool UsesVgprLoadedOutsideDS = false; + bool VMemInvalidated = false; + // DS optimization only applies to GFX12+ where DS_CNT is separate. + bool DSInvalidated = !ST->hasExtendedWaitCounts(); + DenseSet<MCRegUnit> VgprUse; + DenseSet<MCRegUnit> VgprDefVMEM; + DenseSet<MCRegUnit> VgprDefDS; for (MachineBasicBlock *MBB : ML->blocks()) { + bool SeenDSStoreInCurrMBB = false; for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { HasVMemLoad |= MI.mayLoad(); HasVMemStore |= MI.mayStore(); } - + if (mayStoreIncrementingDSCNT(MI)) + SeenDSStoreInCurrMBB = true; + // Stores postdominated by a barrier will have a wait at the barrier + // and thus no need to be waited at the loop header. Barrier found + // later in the same MBB during in-order traversal is used here as a + // cheaper alternative to postdomination check. 
+ if (MI.getOpcode() == AMDGPU::S_BARRIER) + SeenDSStoreInCurrMBB = false; for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; - RegInterval Interval = Brackets.getRegInterval(&MI, Op); // Vgpr use - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprDef.contains(RegNo)) - return false; - VgprUse.insert(RegNo); - // If at least one of Op's registers is in the score brackets, the - // value is likely loaded outside of the loop. - if (Brackets.getRegScore(RegNo, LOAD_CNT) > - Brackets.getScoreLB(LOAD_CNT) || - Brackets.getRegScore(RegNo, SAMPLE_CNT) > - Brackets.getScoreLB(SAMPLE_CNT) || - Brackets.getRegScore(RegNo, BVH_CNT) > - Brackets.getScoreLB(BVH_CNT)) { - UsesVgprLoadedOutside = true; - break; - } + // are invalidated. + if (VgprDefVMEM.contains(RU)) + VMemInvalidated = true; + + // Check for DS loads used inside the loop + if (VgprDefDS.contains(RU)) + DSInvalidated = true; + + // Early exit if both optimizations are invalidated + if (VMemInvalidated && DSInvalidated) + return Flags; + + VgprUse.insert(RU); + // Check if this register has a pending VMEM load from outside the + // loop (value loaded outside and used inside). + VMEMID ID = toVMEMID(RU); + if (Brackets.hasPendingVMEM(ID, LOAD_CNT) || + Brackets.hasPendingVMEM(ID, SAMPLE_CNT) || + Brackets.hasPendingVMEM(ID, BVH_CNT)) + UsesVgprLoadedOutsideVMEM = true; + // Check if loaded outside the loop via DS (not VMEM/FLAT). + // Only consider it a DS load if there's no pending VMEM load for + // this register, since FLAT can set both counters. 
+ else if (Brackets.hasPendingVMEM(ID, DS_CNT)) + UsesVgprLoadedOutsideDS = true; } } // VMem load vgpr def if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) { for (const MachineOperand &Op : MI.all_defs()) { - RegInterval Interval = Brackets.getRegInterval(&MI, Op); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. - // are invalidated and we can exit. - if (VgprUse.contains(RegNo)) - return false; - VgprDef.insert(RegNo); + // are invalidated. + if (VgprUse.contains(RU)) + VMemInvalidated = true; + VgprDefVMEM.insert(RU); + } + } + // Early exit if both optimizations are invalidated + if (VMemInvalidated && DSInvalidated) + return Flags; + } + + // DS read vgpr def + // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo). + // If USE comes before DEF, it's the prefetch pattern (use value from + // previous iteration, load for next iteration). We should still flush + // in preheader so iteration 1 doesn't need to wait inside the loop. + // Only invalidate when DEF comes before USE (same-iteration consumption, + // checked above when processing uses). 
+ if (isDSRead(MI)) { + for (const MachineOperand &Op : MI.all_defs()) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { + VgprDefDS.insert(RU); } } } } + // Accumulate unprotected DS stores from this MBB + SeenDSStoreInLoop |= SeenDSStoreInCurrMBB; } - if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) - return true; - return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); + + // VMEM flush decision + if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM && + ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) || + (HasVMemLoad && ST->hasVmemWriteVgprInOrder()))) + Flags.FlushVmCnt = true; + + // DS flush decision: flush if loop uses DS-loaded values from outside + // and either has no DS reads in the loop, or DS reads whose results + // are not used in the loop. + // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT + // is LGKM_CNT which also tracks FLAT/SMEM. + if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS) + Flags.FlushDsCnt = true; + + return Flags; } bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) { @@ -2714,48 +3560,36 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); + // Initialize hardware limits first, as they're needed by the generators. + Limits = AMDGPU::HardwareLimits(IV); + if (ST->hasExtendedWaitCounts()) { - MaxCounter = NUM_EXTENDED_INST_CNTS; - WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter); - WCG = &WCGGFX12Plus; + IsExpertMode = ST->hasExpertSchedulingMode() && + (ExpertSchedulingModeFlag.getNumOccurrences() + ? ExpertSchedulingModeFlag + : MF.getFunction() + .getFnAttribute("amdgpu-expert-scheduling-mode") + .getValueAsBool()); + MaxCounter = IsExpertMode ? 
NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS; + if (!WCG) + WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits, + IsExpertMode); } else { MaxCounter = NUM_NORMAL_INST_CNTS; - WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF); - WCG = &WCGPreGFX12; + if (!WCG) + WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS, + &Limits); } for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - WaitEventMaskForInst = WCG->getWaitEventMask(); - - SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - - if (ST->hasExtendedWaitCounts()) { - Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); - Limits.DscntMax = AMDGPU::getDscntBitMask(IV); - } else { - Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); - Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); - } - Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); - Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); - Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); - Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); - Limits.XcntMax = AMDGPU::getXcntBitMask(IV); - - [[maybe_unused]] unsigned NumVGPRsMax = - ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()); - [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); - assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); - assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); + SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS); BlockInfos.clear(); bool Modified = false; MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may @@ -2764,9 +3598,9 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. 
- for (MachineBasicBlock::iterator E = EntryBB.end(); - I != E && (I->isPHI() || I->isMetaInstruction()); ++I) - ; + MachineBasicBlock::iterator I = EntryBB.begin(); + while (I != EntryBB.end() && I->isMetaInstruction()) + ++I; if (ST->hasExtendedWaitCounts()) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) @@ -2783,6 +3617,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { TII->get(instrsForExtendedCounterTypes[CT])) .addImm(0); } + if (IsExpertMode) { + unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST); + Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0); + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(Enc); + } } else { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } @@ -2839,7 +3679,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { if (!SuccBI.Incoming) { SuccBI.Dirty = true; if (SuccBII <= BII) { - LLVM_DEBUG(dbgs() << "repeat on backedge\n"); + LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n"); Repeat = true; } if (!MoveBracketsToSucc) { @@ -2847,11 +3687,20 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } else { SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets); } - } else if (SuccBI.Incoming->merge(*Brackets)) { - SuccBI.Dirty = true; - if (SuccBII <= BII) { - LLVM_DEBUG(dbgs() << "repeat on backedge\n"); - Repeat = true; + } else { + LLVM_DEBUG({ + dbgs() << "Try to merge "; + MBB->printName(dbgs()); + dbgs() << " into "; + Succ->printName(dbgs()); + dbgs() << '\n'; + }); + if (SuccBI.Incoming->merge(*Brackets)) { + SuccBI.Dirty = true; + if (SuccBII <= BII) { + LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n"); + Repeat = true; + } } } } @@ -2907,26 +3756,49 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } } + if (IsExpertMode) { + // Enable expert scheduling on function entry. 
To satisfy ABI requirements + // and to allow calls between function with different expert scheduling + // settings, disable it around calls and before returns. + + MachineBasicBlock::iterator I = EntryBB.begin(); + while (I != EntryBB.end() && I->isMetaInstruction()) + ++I; + setSchedulingMode(EntryBB, I, true); + + for (MachineInstr *MI : CallInsts) { + MachineBasicBlock &MBB = *MI->getParent(); + setSchedulingMode(MBB, MI, false); + setSchedulingMode(MBB, std::next(MI->getIterator()), true); + } + + for (MachineInstr *MI : ReturnInsts) + setSchedulingMode(*MI->getParent(), MI, false); + + Modified = true; + } + // Deallocate the VGPRs before previously identified S_ENDPGM instructions. // This is done in different ways depending on how the VGPRs were allocated // (i.e. whether we're in dynamic VGPR mode or not). // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short // waveslot limited kernel runs slower with the deallocation. - if (MFI->isDynamicVGPREnabled()) { - for (MachineInstr *MI : ReleaseVGPRInsts) { + if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) { + for (auto [MI, _] : EndPgmInsts) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_ALLOC_VGPR)) .addImm(0); Modified = true; } - } else { - if (!ReleaseVGPRInsts.empty() && - (MF.getFrameInfo().hasCalls() || - ST->getOccupancyWithNumVGPRs( - TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass), - /*IsDynamicVGPR=*/false) < - AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { - for (MachineInstr *MI : ReleaseVGPRInsts) { + } else if (!WCG->isOptNone() && + ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + (MF.getFrameInfo().hasCalls() || + ST->getOccupancyWithNumVGPRs( + TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass), + /*IsDynamicVGPR=*/false) < + AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + for (auto [MI, Flag] : EndPgmInsts) { + if (Flag) { if (ST->requiresNopBeforeDeallocVGPRs()) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP)) 
@@ -2939,7 +3811,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } } } - ReleaseVGPRInsts.clear(); + + CallInsts.clear(); + ReturnInsts.clear(); + EndPgmInsts.clear(); PreheadersToFlush.clear(); SLoadAddresses.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d516330..24aa31a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -161,7 +162,7 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const { if (!DstReg.isVirtual()) return true; - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { switch (Use.getOpcode()) { case AMDGPU::S_AND_SAVEEXEC_B32: @@ -179,6 +180,10 @@ bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const { return false; } + // If it is not convergent it does not depend on EXEC. 
+ if (!MI.isConvergent()) + return false; + switch (MI.getOpcode()) { default: break; @@ -1154,7 +1159,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } int SIInstrInfo::commuteOpcode(unsigned Opcode) const { - int NewOpc; + int64_t NewOpc; // Try to map original to commuted opcode NewOpc = AMDGPU::getCommuteRev(Opcode); @@ -1325,7 +1330,8 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, case AMDGPU::AV_MOV_B32_IMM_PSEUDO: case AMDGPU::AV_MOV_B64_IMM_PSEUDO: case AMDGPU::S_MOV_B64_IMM_PSEUDO: - case AMDGPU::V_MOV_B64_PSEUDO: { + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B16_t16_e32: { const MachineOperand &Src0 = MI.getOperand(1); if (Src0.isImm()) { ImmVal = Src0.getImm(); @@ -1334,6 +1340,15 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, return false; } + case AMDGPU::V_MOV_B16_t16_e64: { + const MachineOperand &Src0 = MI.getOperand(2); + if (Src0.isImm() && !MI.getOperand(1).getImm()) { + ImmVal = Src0.getImm(); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } case AMDGPU::S_BREV_B32: case AMDGPU::V_BFREV_B32_e32: case AMDGPU::V_BFREV_B32_e64: { @@ -1361,6 +1376,24 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, } } +std::optional<int64_t> +SIInstrInfo::getImmOrMaterializedImm(MachineOperand &Op) const { + if (Op.isImm()) + return Op.getImm(); + + if (!Op.isReg() || !Op.getReg().isVirtual()) + return std::nullopt; + MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo(); + const MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def && Def->isMoveImmediate()) { + const MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg()); + } + + return std::nullopt; +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) @@ -1393,6 +1426,10 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, return 
get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); if (VecSize <= 160) // 20 bytes return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); + if (VecSize <= 192) // 24 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6); + if (VecSize <= 224) // 28 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7); if (VecSize <= 256) // 32 bytes return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); if (VecSize <= 288) // 36 bytes @@ -1421,6 +1458,10 @@ SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); if (VecSize <= 160) // 20 bytes return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); + if (VecSize <= 192) // 24 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6); + if (VecSize <= 224) // 28 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7); if (VecSize <= 256) // 32 bytes return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); if (VecSize <= 288) // 36 bytes @@ -1450,6 +1491,10 @@ static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; if (VecSize <= 160) // 20 bytes return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; + if (VecSize <= 192) // 24 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6; + if (VecSize <= 224) // 28 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7; if (VecSize <= 256) // 32 bytes return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; if (VecSize <= 288) // 36 bytes @@ -1479,6 +1524,10 @@ static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; if (VecSize <= 160) // 20 bytes return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; + if (VecSize <= 192) // 24 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6; + if (VecSize <= 224) // 28 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7; if (VecSize <= 256) // 32 bytes return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; if (VecSize <= 288) // 
36 bytes @@ -1667,8 +1716,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -1680,7 +1728,7 @@ void SIInstrInfo::storeRegToStackSlot( MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), FrameInfo.getObjectAlign(FrameIndex)); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { @@ -1862,14 +1910,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, - Register VReg, + Register VReg, unsigned SubReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -1955,17 +2002,15 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, constexpr unsigned ECQueueWaveAbort = 0x400; MachineBasicBlock *TrapBB = &MBB; - MachineBasicBlock *ContBB = &MBB; MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock(); if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) { - ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false); + 
MBB.splitAt(MI, /*UpdateLiveIns=*/false); TrapBB = MF->CreateMachineBasicBlock(); BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB); MF->push_back(TrapBB); MBB.addSuccessor(TrapBB); } - // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this // will be a nop. BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP)) @@ -2001,7 +2046,7 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI, MF->push_back(HaltLoopBB); HaltLoopBB->addSuccessor(HaltLoopBB); - return ContBB; + return MBB.getNextNode(); } unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { @@ -2132,11 +2177,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32); + const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0); + const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? 
assert(!SrcOp.isFPImm()); - if (ST.hasMovB64()) { - MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); + if (ST.hasMovB64() && Mov64RC->contains(Dst)) { + MI.setDesc(Mov64Desc); if (SrcOp.isReg() || isInlineConstant(MI, 1) || isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals()) break; @@ -2145,17 +2193,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { APInt Imm(64, SrcOp.getImm()); APInt Lo(32, Imm.getLoBits(32).getZExtValue()); APInt Hi(32, Imm.getHiBits(32).getZExtValue()); - if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) { - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) - .addImm(SISrcMods::OP_SEL_1) - .addImm(Lo.getSExtValue()) - .addImm(SISrcMods::OP_SEL_1) - .addImm(Lo.getSExtValue()) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0); // clamp + const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32); + const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0); + + if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) && + PkMovRC->contains(Dst)) { + BuildMI(MBB, MI, DL, PkMovDesc, Dst) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp } else { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) .addImm(Lo.getSExtValue()) @@ -2241,6 +2293,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10: @@ -2253,6 +2307,8 @@ bool 
SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10: @@ -2282,11 +2338,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { assert(VecReg == MI.getOperand(1).getReg()); MachineInstrBuilder MIB = - BuildMI(MBB, MI, DL, OpDesc) - .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .add(MI.getOperand(2)) - .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); + BuildMI(MBB, MI, DL, OpDesc) + .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) + .add(MI.getOperand(2)) + .addReg(VecReg, RegState::ImplicitDefine) + .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef)); const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); @@ -2300,6 +2356,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10: @@ -2324,8 +2382,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) .add(MI.getOperand(2)) .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, - RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); + .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef)); const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); @@ -2344,6 +2401,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10: @@ -2355,18 +2414,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Register Dst = MI.getOperand(0).getReg(); Register VecReg = MI.getOperand(1).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); - Register Idx = MI.getOperand(2).getReg(); Register SubReg = MI.getOperand(3).getImm(); MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(Idx) + .add(MI.getOperand(2)) .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); SetOn->getOperand(3).setIsUndef(); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) .addDef(Dst) .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); + .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef)); MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); @@ -2500,7 +2558,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } break; - case AMDGPU::V_MAX_BF16_PSEUDO_e64: + case AMDGPU::V_MAX_BF16_PSEUDO_e64: { assert(ST.hasBF16PackedInsts()); MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16)); MI.addOperand(MachineOperand::CreateImm(0)); // op_sel @@ -2513,13 +2571,46 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } + case AMDGPU::GET_STACK_BASE: + // The stack starts at offset 0 unless we need to reserve some space at the + // bottom. + if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) { + // When CWSR is used in dynamic VGPR mode, the trap handler needs to save + // some of the VGPRs. The size of the required scratch space has already + // been computed by prolog epilog insertion. + const SIMachineFunctionInfo *MFI = + MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs(); + Register DestReg = MI.getOperand(0).getReg(); + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg) + .addImm(AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2)); + // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute + // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set + // SCC, so we need to check for 0 manually. + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg); + // Change the implicif-def of SCC to an explicit use (but first remove + // the dead flag if present). 
+ MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false); + MI.getOperand(MI.getNumExplicitOperands()).setIsUse(); + MI.setDesc(get(AMDGPU::S_CMOVK_I32)); + MI.addOperand(MachineOperand::CreateImm(VGPRSize)); + } else { + MI.setDesc(get(AMDGPU::S_MOV_B32)); + MI.addOperand(MachineOperand::CreateImm(0)); + MI.removeOperand( + MI.getNumExplicitOperands()); // Drop implicit def of SCC. + } + break; + } + return true; } void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &RI) const { + unsigned SubIdx, + const MachineInstr &Orig) const { // Try shrinking the instruction to remat only the part needed for current // context. @@ -2569,7 +2660,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI)); + RI.getAllocatableClass(getRegClass(TID, 0)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -2599,7 +2690,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, break; } - TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig); } std::pair<MachineInstr*, MachineInstr*> @@ -2644,7 +2735,7 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { if (Src.isPhysical()) MovDPP.addReg(RI.getSubReg(Src, Sub)); else - MovDPP.addReg(Src, SrcOp.isUndef() ? 
RegState::Undef : 0, Sub); + MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub); } } @@ -2907,7 +2998,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, auto I = MBB.end(); auto &MCCtx = MF->getContext(); - if (ST.hasAddPC64Inst()) { + if (ST.useAddPC64Inst()) { MCSymbol *Offset = MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true); auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64)) @@ -2935,7 +3026,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() { if (FlushSGPRWrites) BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST)); }; // We need to compute the offset relative to the instruction immediately after @@ -2953,11 +3044,11 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) - .addReg(PCReg, 0, AMDGPU::sub0) + .addReg(PCReg, {}, AMDGPU::sub0) .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) - .addReg(PCReg, 0, AMDGPU::sub1) + .addReg(PCReg, {}, AMDGPU::sub1) .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); ApplyHazardWorkarounds(); @@ -3377,15 +3468,13 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineInstr *Select; if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { - Select = - BuildMI(MBB, I, DL, get(SelOp), DstElt) - .addReg(FalseReg, 0, SubIdx) - .addReg(TrueReg, 0, SubIdx); + Select = BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, {}, SubIdx) + .addReg(TrueReg, {}, SubIdx); } else { - Select = - BuildMI(MBB, I, DL, get(SelOp), DstElt) - .addReg(TrueReg, 0, SubIdx) - .addReg(FalseReg, 0, SubIdx); + Select = BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(TrueReg, {}, SubIdx) 
+ .addReg(FalseReg, {}, SubIdx); } preserveCondRegFlags(Select->getOperand(3), Cond[1]); @@ -3461,6 +3550,21 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const { } } +void SIInstrInfo::mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const { + MI.setDesc(NewDesc); + + // Remove any leftover implicit operands from mutating the instruction. e.g. + // if we replace an s_and_b32 with a copy, we don't need the implicit scc def + // anymore. + const MCInstrDesc &Desc = MI.getDesc(); + unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() + + Desc.implicit_defs().size(); + + for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) + MI.removeOperand(I); +} + std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm, unsigned SubRegIndex) { switch (SubRegIndex) { @@ -3503,6 +3607,8 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMAC_F16_t16_e32: + case AMDGPU::V_FMAC_F16_fake16_e32: case AMDGPU::V_FMA_F16_e64: return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() ? AMDGPU::V_FMAAK_F16_t16 @@ -3535,6 +3641,8 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMAC_F16_t16_e32: + case AMDGPU::V_FMAC_F16_fake16_e32: case AMDGPU::V_FMA_F16_e64: return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() ? 
AMDGPU::V_FMAMK_F16_t16 @@ -3612,7 +3720,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -3703,6 +3811,23 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); + auto CopyRegOperandToNarrowerRC = + [MRI, this](MachineInstr &MI, unsigned OpNo, + const TargetRegisterClass *NewRC) -> void { + if (!MI.getOperand(OpNo).isReg()) + return; + Register Reg = MI.getOperand(OpNo).getReg(); + const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg); + if (RI.getCommonSubClass(RC, NewRC) != NewRC) + return; + Register Tmp = MRI->createVirtualRegister(NewRC); + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + get(AMDGPU::COPY), Tmp) + .addReg(Reg); + MI.getOperand(OpNo).setReg(Tmp); + MI.getOperand(OpNo).setIsKill(); + }; + // Multiplied part is the constant: Use v_madmk_{f16, f32}. if ((Src0->isReg() && Src0->getReg() == Reg) || (Src1->isReg() && Src1->getReg() == Reg)) { @@ -3734,13 +3859,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16 - // takes VGPR_32_Lo128 operands, so the rewrite would also require - // restricting their register classes. For now just bail out. 
- if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || - NewOpc == AMDGPU::V_FMAMK_F16_fake16) - return false; - const std::optional<int64_t> SubRegImm = extractSubregFromImm( Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg()); @@ -3765,6 +3883,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || + NewOpc == AMDGPU::V_FMAMK_F16_fake16) { + const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0); + Register Tmp = MRI->createVirtualRegister(NewRC); + BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()), + UseMI.getDebugLoc(), get(AMDGPU::COPY), + UseMI.getOperand(0).getReg()) + .addReg(Tmp, RegState::Kill); + UseMI.getOperand(0).setReg(Tmp); + CopyRegOperandToNarrowerRC(UseMI, 1, NewRC); + CopyRegOperandToNarrowerRC(UseMI, 3, NewRC); + } + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3812,13 +3943,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16 - // takes VGPR_32_Lo128 operands, so the rewrite would also require - // restricting their register classes. For now just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || - NewOpc == AMDGPU::V_FMAAK_F16_fake16) - return false; - // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -3838,6 +3962,20 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // These come before src2. 
removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + + if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || + NewOpc == AMDGPU::V_FMAAK_F16_fake16) { + const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0); + Register Tmp = MRI->createVirtualRegister(NewRC); + BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()), + UseMI.getDebugLoc(), get(AMDGPU::COPY), + UseMI.getOperand(0).getReg()) + .addReg(Tmp, RegState::Kill); + UseMI.getOperand(0).setReg(Tmp); + CopyRegOperandToNarrowerRC(UseMI, 1, NewRC); + CopyRegOperandToNarrowerRC(UseMI, 2, NewRC); + } + // It might happen that UseMI was commuted // and we now have SGPR as SRC1. If so 2 inlined // constant and SGPR are illegal. @@ -3917,6 +4055,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isLDSDMA(MIa) || isLDSDMA(MIb)) return false; + if (MIa.isBundle() || MIb.isBundle()) + return false; + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -3982,7 +4123,7 @@ static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; - const MachineFunction *MF = MO->getParent()->getParent()->getParent(); + const MachineFunction *MF = MO->getParent()->getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } @@ -4032,28 +4173,50 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { } } +/// Helper struct for the implementation of 3-address conversion to communicate +/// updates made to instruction operands. +struct SIInstrInfo::ThreeAddressUpdates { + /// Other instruction whose def is no longer used by the converted + /// instruction. 
+ MachineInstr *RemoveMIUse = nullptr; +}; + MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { MachineBasicBlock &MBB = *MI.getParent(); - unsigned Opc = MI.getOpcode(); + MachineInstr *CandidateMI = &MI; - // Handle MFMA. - int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); - if (NewMFMAOpc != -1) { - MachineInstrBuilder MIB = - BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); - updateLiveVariables(LV, MI, *MIB); + if (MI.isBundle()) { + // This is a temporary placeholder for bundle handling that enables us to + // exercise the relevant code paths in the two-address instruction pass. + if (MI.getBundleSize() != 1) + return nullptr; + CandidateMI = MI.getNextNode(); + } + + ThreeAddressUpdates U; + MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U); + if (!NewMI) + return nullptr; + + if (MI.isBundle()) { + CandidateMI->eraseFromBundle(); + + for (MachineOperand &MO : MI.all_defs()) { + if (MO.isTied()) + MI.untieRegOperand(MO.getOperandNo()); + } + } else { + updateLiveVariables(LV, MI, *NewMI); if (LIS) { - LIS->ReplaceMachineInstrInMaps(MI, *MIB); + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); // SlotIndex of defs needs to be updated when converting to early-clobber - MachineOperand &Def = MIB->getOperand(0); + MachineOperand &Def = NewMI->getOperand(0); if (Def.isEarlyClobber() && Def.isReg() && LIS->hasInterval(Def.getReg())) { - SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false); - SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true); + SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false); + SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true); auto &LI = LIS->getInterval(Def.getReg()); auto UpdateDefIndex = [&](LiveRange &LR) { auto *S = LR.find(OldIndex); @@ -4068,6 +4231,88 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI, UpdateDefIndex(SR); } } + } + + if (U.RemoveMIUse) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + Register DefReg = U.RemoveMIUse->getOperand(0).getReg(); + + if (MRI.hasOneNonDBGUse(DefReg)) { + // We cannot just remove the DefMI here, calling pass will crash. + U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF)); + U.RemoveMIUse->getOperand(0).setIsDead(true); + for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I) + U.RemoveMIUse->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); + } + + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MO : MI.all_uses()) { + if (MO.isReg() && MO.getReg() == DefReg) { + assert(MO.getSubReg() == 0 && + "tied sub-registers in bundles currently not supported"); + MI.removeOperand(MO.getOperandNo()); + break; + } + } + + if (LIS) + LIS->shrinkToUses(&LIS->getInterval(DefReg)); + } + } else if (LIS) { + LiveInterval &DefLI = LIS->getInterval(DefReg); + + // We cannot delete the original instruction here, so hack out the use + // in the original instruction with a dummy register so we can use + // shrinkToUses to deal with any multi-use edge cases. Other targets do + // not have the complexity of deleting a use to consider here. 
+ Register DummyReg = MRI.cloneVirtualRegister(DefReg); + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + + if (MI.isBundle()) { + VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg); + if (!VRI.Reads && !VRI.Writes) { + for (MachineOperand &MIOp : MI.uses()) { + if (MIOp.isReg() && MIOp.getReg() == DefReg) { + MIOp.setIsUndef(true); + MIOp.setReg(DummyReg); + } + } + } + + MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false, + false, /*isUndef=*/true)); + } + + LIS->shrinkToUses(&DefLI); + } + } + + return MI.isBundle() ? &MI : NewMI; +} + +MachineInstr * +SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &U) const { + MachineBasicBlock &MBB = *MI.getParent(); + unsigned Opc = MI.getOpcode(); + + // Handle MFMA. + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc != -1) { + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); return MIB; } @@ -4075,13 +4320,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .setMIFlags(MI.getFlags()); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) MIB->addOperand(MI.getOperand(I)); - - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - return MIB; } @@ -4152,39 +4392,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&]() -> void { - MachineRegisterInfo &MRI = 
MBB.getParent()->getRegInfo(); - // The only user is the instruction which will be killed. - Register DefReg = DefMI->getOperand(0).getReg(); - - if (MRI.hasOneNonDBGUse(DefReg)) { - // We cannot just remove the DefMI here, calling pass will crash. - DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); - DefMI->getOperand(0).setIsDead(true); - for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) - DefMI->removeOperand(I); - if (LV) - LV->getVarInfo(DefReg).AliveBlocks.clear(); - } - - if (LIS) { - LiveInterval &DefLI = LIS->getInterval(DefReg); - - // We cannot delete the original instruction here, so hack out the use - // in the original instruction with a dummy register so we can use - // shrinkToUses to deal with any multi-use edge cases. Other targets do - // not have the complexity of deleting a use to consider here. - Register DummyReg = MRI.cloneVirtualRegister(DefReg); - for (MachineOperand &MIOp : MI.uses()) { - if (MIOp.isReg() && MIOp.getReg() == DefReg) { - MIOp.setIsUndef(true); - MIOp.setReg(DummyReg); - } - } - - LIS->shrinkToUses(&DefLI); - } - }; int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { @@ -4196,10 +4403,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Src1) .addImm(Imm) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4212,11 +4416,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + U.RemoveMIUse = DefMI; return MIB; } } @@ -4235,12 +4435,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .addImm(Imm) .add(*Src2) .setMIFlags(MI.getFlags()); - updateLiveVariables(LV, MI, *MIB); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - if (DefMI) - killDef(); + 
U.RemoveMIUse = DefMI; return MIB; } } @@ -4269,9 +4464,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .setMIFlags(MI.getFlags()); if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) MIB.addImm(OpSel ? OpSel->getImm() : 0); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); return MIB; } @@ -4321,24 +4513,26 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, changesVGPRIndexingMode(MI); } -bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { +bool SIInstrInfo::isAlwaysGDS(uint32_t Opcode) const { return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_ADD_GS_REG_RTN || Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } -bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { - if (!isFLAT(MI) || isFLATGlobal(MI)) - return false; - - // If scratch is not initialized, we can never access it. - if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) +bool SIInstrInfo::mayAccessScratch(const MachineInstr &MI) const { + // Instructions that access scratch use FLAT encoding or BUF encodings. + if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI)) return false; // SCRATCH instructions always access scratch. if (isFLATScratch(MI)) return true; + // If FLAT_SCRATCH registers are not initialized, we can never access scratch + // via the aperture. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + // If there are no memory operands then conservatively assume the flat // operation may access scratch. 
if (MI.memoperands_empty()) @@ -4569,6 +4763,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return AMDGPU::isInlinableLiteralV2F16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus()); case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); @@ -4945,8 +5141,8 @@ bool SIInstrInfo::verifyCopy(const MachineInstr &MI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { - uint16_t Opcode = MI.getOpcode(); - const MachineFunction *MF = MI.getParent()->getParent(); + uint32_t Opcode = MI.getOpcode(); + const MachineFunction *MF = MI.getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: At this point the COPY verify is done only for non-ssa forms. @@ -5036,6 +5232,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2BF16: @@ -5104,7 +5301,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // aligned register constraint. 
// FIXME: We do not verify inline asm operands, but custom inline asm // verification is broken anyway - if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) { + if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO && + Opcode != AMDGPU::V_MOV_B64_PSEUDO) { const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { if (const TargetRegisterClass *SubRC = @@ -5200,7 +5398,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); + uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); if (isVOPC(BasicOpcode)) { if (!ST.hasSDWASdst() && DstIdx != -1) { // Only vcc allowed as dst on VI for VOPC @@ -5450,9 +5648,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, Desc.getNumOperands() + Desc.implicit_uses().size(); const unsigned NumImplicitOps = IsDst ? 2 : 1; - // Allow additional implicit operands. This allows a fixup done by the post - // RA scheduler where the main implicit operand is killed and implicit-defs - // are added for sub-registers that remain live after this instruction. + // Require additional implicit operands. This allows a fixup done by the + // post RA scheduler where the main implicit operand is killed and + // implicit-defs are added for sub-registers that remain live after this + // instruction. 
if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { ErrInfo = "missing implicit register operands"; return false; @@ -5734,6 +5933,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) && + MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) { + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst); + if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) == + &AMDGPU::SReg_64RegClass) || + Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) { + ErrInfo = "Instruction cannot read flat_scratch_base_hi"; + return false; + } + } + return true; } @@ -5752,17 +5962,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; case AMDGPU::S_MOV_B32: { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; } case AMDGPU::S_ADD_I32: - return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; + return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; case AMDGPU::S_SUB_I32: - return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; + return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; // FIXME: These are not consistently handled, and selected when the carry is // used. case AMDGPU::S_ADD_U32: @@ -6019,19 +6229,6 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction"); } -// FIXME: This should not be an overridable function. 
All subtarget dependent -// operand modifications should go through isLookupRegClassByHwMode in the -// generic handling. -const TargetRegisterClass * -SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum >= TID.getNumOperands()) - return nullptr; - const MCOperandInfo &OpInfo = TID.operands()[OpNum]; - int16_t RegClass = getOpRegClassID(OpInfo); - return RI.getRegClass(RegClass); -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MCInstrDesc &Desc = get(MI.getOpcode()); @@ -6040,14 +6237,14 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, Register Reg = MI.getOperand(OpNo).getReg(); if (Reg.isVirtual()) { - const MachineRegisterInfo &MRI = - MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); return MRI.getRegClass(Reg); } return RI.getPhysRegBaseClass(Reg); } - return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo])); + int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]); + return RegClass < 0 ? 
nullptr : RI.getRegClass(RegClass); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -6086,7 +6283,7 @@ unsigned SIInstrInfo::buildExtractSubReg( unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(SuperReg.getReg(), 0, NewSubIdx); + .addReg(SuperReg.getReg(), {}, NewSubIdx); return SubReg; } @@ -6131,7 +6328,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC = MRI.getRegClass(Reg); if (MO.getSubReg()) { - const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + const MachineFunction *MF = MO.getParent()->getMF(); const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); if (!SuperRC) return false; @@ -6143,7 +6340,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &MO) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx]; unsigned Opc = MI.getOpcode(); @@ -6151,7 +6348,7 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, // information. 
if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) { - constexpr const AMDGPU::OpName OpNames[] = { + constexpr AMDGPU::OpName OpNames[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2}; for (auto [I, OpName] : enumerate(OpNames)) { @@ -6196,6 +6393,18 @@ bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx, (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && RI.isSGPRReg(MRI, MO.getReg())) return false; + + if (ST.hasFlatScratchHiInB64InstHazard() && + MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) { + if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) { + if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) == + 64) + return false; + } + if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64) + return false; + } + return true; } @@ -6213,8 +6422,8 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO) const { - constexpr const unsigned NumOps = 3; - constexpr const AMDGPU::OpName OpNames[NumOps * 2] = { + constexpr unsigned NumOps = 3; + constexpr AMDGPU::OpName OpNames[NumOps * 2] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}; @@ -6245,7 +6454,7 @@ bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand( bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; @@ -6670,7 +6879,7 @@ 
Register SIInstrInfo::readlaneVGPRToSGPR( Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), SGPR) - .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); + .addReg(SrcReg, {}, RI.getSubRegFromChannel(i)); SRegs.push_back(SGPR); } @@ -6799,7 +7008,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, return; const TargetRegisterClass *DeclaredRC = - getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); + getRegClass(MI.getDesc(), SAddr->getOperandNo()); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); @@ -6898,7 +7107,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, ScalarOp->setIsKill(); } else { SmallVector<Register, 8> ReadlanePieces; - unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); + RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); @@ -7141,7 +7350,7 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { MachineBasicBlock * SIInstrInfo::legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT) const { - MachineFunction &MF = *MI.getParent()->getParent(); + MachineFunction &MF = *MI.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock *CreatedBB = nullptr; @@ -7169,44 +7378,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } - // Legalize REG_SEQUENCE and PHI - // The register class of the operands much be the same type as the register + // Legalize PHI + // The register class of the operands must be the same type as the register // class of the output. 
if (MI.getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { - if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) - continue; - const TargetRegisterClass *OpRC = - MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVectorRegisters(OpRC)) { - VRC = OpRC; - } else { - SRC = OpRC; - } - } - - // If any of the operands are VGPR registers, then they all most be - // otherwise we will create illegal VGPR->SGPR copies when legalizing - // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { - if (!VRC) { - assert(SRC); - if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { - VRC = &AMDGPU::VReg_1RegClass; - } else - VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(SRC) - : RI.getEquivalentVGPRClass(SRC); - } else { - VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) - ? RI.getEquivalentAGPRClass(VRC) - : RI.getEquivalentVGPRClass(VRC); - } - RC = VRC; - } else { - RC = SRC; - } + const TargetRegisterClass *VRC = getOpRegClass(MI, 0); + assert(!RI.isSGPRClass(VRC)); // Update all the operands so they have the same type. for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { @@ -7220,7 +7397,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Avoid creating no-op copies with the same src and dst reg class. These // confuse some of the machine passes. 
- legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); + legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc()); } } @@ -7426,18 +7603,18 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 const DebugLoc &DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) - .addDef(CondReg0) - .addReg(RsrcPtr, 0, AMDGPU::sub0) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addImm(0); + .addDef(CondReg0) + .addReg(RsrcPtr, {}, AMDGPU::sub0) + .addReg(VAddr->getReg(), {}, AMDGPU::sub0) + .addImm(0); // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) - .addDef(CondReg1, RegState::Dead) - .addReg(RsrcPtr, 0, AMDGPU::sub1) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(CondReg0, RegState::Kill) - .addImm(0); + .addDef(CondReg1, RegState::Dead) + .addReg(RsrcPtr, {}, AMDGPU::sub1) + .addReg(VAddr->getReg(), {}, AMDGPU::sub1) + .addReg(CondReg0, RegState::Kill) + .addImm(0); // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) @@ -7510,9 +7687,9 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(RsrcPtr, 0, AMDGPU::sub0) + .addReg(RsrcPtr, {}, AMDGPU::sub0) .addImm(AMDGPU::sub0) - .addReg(RsrcPtr, 0, AMDGPU::sub1) + .addReg(RsrcPtr, {}, AMDGPU::sub1) .addImm(AMDGPU::sub1); } else { // Legalize a VGPR Rsrc and soffset together. 
@@ -7630,6 +7807,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7781,6 +7960,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Inst.eraseFromParent(); return; + case AMDGPU::S_ABSDIFF_I32: + lowerScalarAbsDiff(Worklist, Inst); + Inst.eraseFromParent(); + return; + case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: { // Clear unused bits of vcc @@ -7867,7 +8051,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7887,12 +8070,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 
3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -7943,7 +8151,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7983,13 +8191,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -7997,7 +8204,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .add(Inst.getOperand(1)); BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers - .addReg(TmpReg, 0, AMDGPU::hi16) + .addReg(TmpReg, {}, AMDGPU::hi16) .addImm(0) // clamp .addImm(0) // omod .addImm(0); // op_sel0 @@ -8019,7 +8226,6 @@ 
void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8037,7 +8243,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8061,7 +8266,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8110,26 +8314,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. 
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) { Register NewDstReg = Inst.getOperand(1).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - MRI.clearKillFlags(NewDstReg); - Inst.getOperand(0).setReg(DstReg); - Inst.eraseFromParent(); - // Legalize t16 operand since replaceReg is called after addUsersToVALU - for (MachineOperand &MO : - make_early_inc_range(MRI.use_operands(NewDstReg))) { - legalizeOperandsVALUt16(*MO.getParent(), MRI); + const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg); + if (const TargetRegisterClass *CommonRC = + RI.getCommonSubClass(NewDstRC, SrcRC)) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. 
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, NewDstReg); + MRI.clearKillFlags(NewDstReg); + Inst.getOperand(0).setReg(DstReg); + + if (!MRI.constrainRegClass(NewDstReg, CommonRC)) + llvm_unreachable("failed to constrain register"); + + Inst.eraseFromParent(); + // Legalize t16 operand since replaceReg is called after addUsersToVALU + for (MachineOperand &MO : + make_early_inc_range(MRI.use_operands(NewDstReg))) { + legalizeOperandsVALUt16(*MO.getParent(), MRI); + } + + return; } - return; } // If this is a v2s copy between 16bit and 32bit reg, @@ -8181,7 +8393,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -8268,7 +8480,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, std::pair<bool, MachineBasicBlock *> SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { - if (ST.hasAddNoCarry()) { + if (ST.hasAddNoCarryInsts()) { // Assume there is no user of scc since we don't select this in that case. // Since scc isn't used, it doesn't really matter if the i32 or u32 variant // is used. 
@@ -8307,7 +8519,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -8388,15 +8600,15 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned SubOp = ST.hasAddNoCarry() ? - AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; + unsigned SubOp = ST.hasAddNoCarryInsts() ? 
AMDGPU::V_SUB_U32_e32 + : AMDGPU::V_SUB_CO_U32_e32; BuildMI(MBB, MII, DL, get(SubOp), TmpReg) .addImm(0) @@ -8410,6 +8622,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src1 = Inst.getOperand(1); + MachineOperand &Src2 = Inst.getOperand(2); + Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32 + : AMDGPU::V_SUB_CO_U32_e32; + + BuildMI(MBB, MII, DL, get(SubOp), SubResultReg) + .addReg(Src1.getReg()) + .addReg(Src2.getReg()); + + BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(SubResultReg) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -8541,7 +8784,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -8775,7 +9018,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = 
Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -8937,7 +9180,7 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) - .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) + .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0) .addImm(0) .addImm(BitWidth); @@ -8961,14 +9204,14 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) - .addImm(31) - .addReg(Src.getReg(), 0, AMDGPU::sub0); + .addImm(31) + .addReg(Src.getReg(), {}, AMDGPU::sub0); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) - .addReg(Src.getReg(), 0, AMDGPU::sub0) - .addImm(AMDGPU::sub0) - .addReg(TmpReg) - .addImm(AMDGPU::sub1); + .addReg(Src.getReg(), {}, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); @@ -8993,8 +9236,8 @@ void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist, const MCInstrDesc &InstDesc = get(Opcode); bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32; - unsigned OpcodeAdd = - ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; + unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 + : AMDGPU::V_ADD_CO_U32_e32; const TargetRegisterClass *SrcRC = Src.isReg() ? 
MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; @@ -9072,6 +9315,67 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineOperand &Src1 = Inst.getOperand(2); const DebugLoc &DL = Inst.getDebugLoc(); + if (ST.useRealTrue16Insts()) { + Register SrcReg0, SrcReg1; + if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) { + SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0); + } else { + SrcReg0 = Src0.getReg(); + } + + if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) { + SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1); + } else { + SrcReg1 = Src1.getReg(); + } + + bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass); + bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass); + + auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg); + switch (Inst.getOpcode()) { + case AMDGPU::S_PACK_LL_B32_B16: + NewMI + .addReg(SrcReg0, {}, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, + isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_LH_B32_B16: + NewMI + .addReg(SrcReg0, {}, + isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HL_B32_B16: + NewMI.addReg(SrcReg0, {}, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, + isSrc1Reg16 ? 
AMDGPU::NoSubRegister : AMDGPU::lo16) + .addImm(AMDGPU::hi16); + break; + case AMDGPU::S_PACK_HH_B32_B16: + NewMI.addReg(SrcReg0, {}, AMDGPU::hi16) + .addImm(AMDGPU::lo16) + .addReg(SrcReg1, {}, AMDGPU::hi16) + .addImm(AMDGPU::hi16); + break; + default: + llvm_unreachable("unhandled s_pack_* instruction"); + } + + MachineOperand &Dest = Inst.getOperand(0); + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return; + } + switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -9136,7 +9440,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { @@ -9154,7 +9458,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false); if (SCCIdx != -1) { if (MI.isCopy()) { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); Register DestReg = MI.getOperand(0).getReg(); MRI.replaceRegWith(DestReg, NewCond); @@ -9266,7 +9570,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, return SGPRReg; Register UsedSGPRs[3] = {Register()}; - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; @@ -9490,7 +9794,14 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { LiteralSize = 8; break; case AMDGPU::OPERAND_REG_IMM_INT64: - if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false)) + // A 32-bit literal is only valid when the value fits 
in BOTH signed + // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code + // emitter's getLit64Encoding logic. This is because of the lack of + // ability to tell signedness of the literal, therefore we need to + // be conservative and assume values outside this range require a + // 64-bit literal encoding (8 bytes). + if (!Op.isImm() || !isInt<32>(Op.getImm()) || + !isUInt<32>(Op.getImm())) LiteralSize = 8; break; } @@ -9516,7 +9827,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInstBundleSize(MI); case TargetOpcode::INLINEASM: case TargetOpcode::INLINEASM_BR: { - const MachineFunction *MF = MI.getParent()->getParent(); + const MachineFunction *MF = MI.getMF(); const char *AsmStr = MI.getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); } @@ -9628,6 +9939,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { {MONoClobber, "amdgpu-noclobber"}, {MOLastUse, "amdgpu-last-use"}, {MOCooperative, "amdgpu-cooperative"}, + {MOThreadPrivate, "amdgpu-thread-private"}, }; return ArrayRef(TargetFlags); @@ -9643,6 +9955,30 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, return AMDGPU::COPY; } +bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const { + uint32_t Opcode = MI.getOpcode(); + // Check if it is SGPR spill or wwm-register spill Opcode. + if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode)) + return true; + + const MachineFunction *MF = MI.getMF(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + // See if this is a live-range split instruction inserted for SGPR or + // wwm-register. The implicit def inserted for wwm-registers should also be + // included as they can appear at the bb begin.
+ bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit); + if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF) + return false; + + Register Reg = MI.getOperand(0).getReg(); + if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg))) + return IsLRSplitInst; + + return MFI->isWWMReg(Reg); +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, Register Reg) const { // We need to handle instructions which may be inserted during register @@ -9651,20 +9987,16 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, // needed by the prolog. However, the insertions for scalar registers can // always be placed at the BB top as they are independent of the exec mask // value. - const MachineFunction *MF = MI.getParent()->getParent(); bool IsNullOrVectorRegister = true; if (Reg) { + const MachineFunction *MF = MI.getMF(); const MachineRegisterInfo &MRI = MF->getRegInfo(); IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); } - uint16_t Opcode = MI.getOpcode(); - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); return IsNullOrVectorRegister && - (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) || - (Opcode == AMDGPU::IMPLICIT_DEF && - MFI->isWWMReg(MI.getOperand(0).getReg())) || - (!MI.isTerminator() && Opcode != AMDGPU::COPY && + (canAddToBBProlog(MI) || + (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI))); } @@ -9673,7 +10005,7 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const { - if (ST.hasAddNoCarry()) + if (ST.hasAddNoCarryInsts()) return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -9689,7 +10021,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, const DebugLoc &DL, Register DestReg, RegScavenger &RS) const { - if (ST.hasAddNoCarry()) + if (ST.hasAddNoCarryInsts()) return BuildMI(MBB, 
I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); // If available, prefer to use vcc. @@ -9746,6 +10078,9 @@ void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { if (MI.isInlineAsm()) return; + if (MI.getNumOperands() < MI.getNumExplicitOperands()) + return; + for (auto &Op : MI.implicit_operands()) { if (Op.isReg() && Op.getReg() == AMDGPU::VCC) Op.setReg(AMDGPU::VCC_LO); @@ -9928,6 +10263,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { case AMDGPUSubtarget::GFX12: return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250 : SIEncodingFamily::GFX12; + case AMDGPUSubtarget::GFX13: + return SIEncodingFamily::GFX13; } llvm_unreachable("Unknown subtarget generation!"); } @@ -9986,7 +10323,8 @@ static bool isRenamedInGFX9(int Opcode) { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { - Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); + assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) && + "SIInsertWaitcnts should have promoted soft waitcnt instructions!"); unsigned Gen = subtargetEncodingFamily(ST); @@ -10019,9 +10357,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { Opcode = MFMAOp; } - int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); + int64_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen); - if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts()) + if (MCOp == (uint32_t)-1 && ST.hasGFX1250Insts()) MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12); // -1 means that Opcode is already a native instruction. 
@@ -10029,20 +10367,20 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { return Opcode; if (ST.hasGFX90AInsts()) { - uint16_t NMCOp = (uint16_t)-1; + uint32_t NMCOp = (uint32_t)-1; if (ST.hasGFX940Insts()) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); - if (NMCOp == (uint16_t)-1) + if (NMCOp == (uint32_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); - if (NMCOp == (uint16_t)-1) + if (NMCOp == (uint32_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); - if (NMCOp != (uint16_t)-1) + if (NMCOp != (uint32_t)-1) MCOp = NMCOp; } - // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // (uint32_t)-1 means that Opcode is a pseudo instruction that has // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) + if (MCOp == (uint32_t)-1) return -1; if (isAsmOnlyOpcode(MCOp)) @@ -10097,7 +10435,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10246,7 +10584,7 @@ MachineInstr *SIInstrInfo::createPHISourceCopy( InsPt++; return BuildMI(MBB, InsPt, DL, get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst) - .addReg(Src, 0, SrcSubReg) + .addReg(Src, {}, SrcSubReg) .addReg(AMDGPU::EXEC, RegState::Implicit); } return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, @@ -10310,6 +10648,14 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return SchedModel.computeInstrLatency(&MI); } +const MachineOperand & +SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const { + if (const MachineOperand *CallAddrOp = + getNamedOperand(MI, AMDGPU::OpName::src0)) + return *CallAddrOp; + return TargetInstrInfo::getCalleeOperand(MI); +} + InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { const 
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); @@ -10385,6 +10731,12 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } +const MIRFormatter *SIInstrInfo::getMIRFormatter() const { + if (!Formatter) + Formatter = std::make_unique<AMDGPUMIRFormatter>(ST); + return Formatter.get(); +} + InstructionUniformity SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { @@ -10438,7 +10790,7 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo(); // FIXME: It's conceptually broken to report this for an instruction, and not @@ -10555,6 +10907,135 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +static bool isSCCDeadOnExit(MachineBasicBlock *MBB) { + for (MachineBasicBlock *S : MBB->successors()) { + if (S->isLiveIn(AMDGPU::SCC)) + return false; + } + return true; +} + +// Invert all uses of SCC following SCCDef because SCCDef may be deleted and +// (incoming SCC) = !(SCC defined by SCCDef). +// Return true if all uses can be re-written, false otherwise. +bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { + MachineBasicBlock *MBB = SCCDef->getParent(); + SmallVector<MachineInstr *> InvertInstr; + bool SCCIsDead = false; + + // Scan instructions for SCC uses that need to be inverted until SCC is dead. 
+ constexpr unsigned ScanLimit = 12; + unsigned Count = 0; + for (MachineInstr &MI : + make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) { + if (++Count > ScanLimit) + return false; + if (MI.readsRegister(AMDGPU::SCC, &RI)) { + if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || + MI.getOpcode() == AMDGPU::S_CSELECT_B64 || + MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1) + InvertInstr.push_back(&MI); + else + return false; + } + if (MI.definesRegister(AMDGPU::SCC, &RI)) { + SCCIsDead = true; + break; + } + } + if (!SCCIsDead && isSCCDeadOnExit(MBB)) + SCCIsDead = true; + + // SCC may have more uses. Can't invert all of them. + if (!SCCIsDead) + return false; + + // Invert uses + for (MachineInstr *MI : InvertInstr) { + if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 || + MI->getOpcode() == AMDGPU::S_CSELECT_B64) { + swapOperands(*MI); + } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) { + MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 + ? AMDGPU::S_CBRANCH_SCC1 + : AMDGPU::S_CBRANCH_SCC0)); + } else { + llvm_unreachable("SCC used but no inversion handling"); + } + } + return true; +} + +// SCC is already valid after SCCValid. +// SCCRedefine will redefine SCC to the same value already available after +// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and +// update kill/dead flags if necessary. 
+bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + bool NeedInversion) const { + MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; + for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), + SCCRedefine->getIterator())) { + if (MI.modifiesRegister(AMDGPU::SCC, &RI)) + return false; + if (MI.killsRegister(AMDGPU::SCC, &RI)) + KillsSCC = &MI; + } + if (NeedInversion && !invertSCCUse(SCCRedefine)) + return false; + if (MachineOperand *SccDef = + SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) + SccDef->setIsDead(false); + if (KillsSCC) + KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr); + SCCRedefine->eraseFromParent(); + return true; +} + +static bool foldableSelect(const MachineInstr &Def) { + if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 && + Def.getOpcode() != AMDGPU::S_CSELECT_B64) + return false; + bool Op1IsNonZeroImm = + Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0; + bool Op2IsZeroImm = + Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0; + if (!Op1IsNonZeroImm || !Op2IsZeroImm) + return false; + return true; +} + +static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, + unsigned &NewDefOpc) { + // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0. + // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead. 
+ if (Def.getOpcode() != AMDGPU::S_ADD_I32 && + Def.getOpcode() != AMDGPU::S_ADD_U32) + return false; + const MachineOperand &AddSrc1 = Def.getOperand(1); + const MachineOperand &AddSrc2 = Def.getOperand(2); + int64_t addend; + + if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) && + (!AddSrc2.isImm() || AddSrc2.getImm() != 1) && + (!getFoldableImm(&AddSrc1, addend) || addend != 1) && + (!getFoldableImm(&AddSrc2, addend) || addend != 1)) + return false; + + if (Def.getOpcode() == AMDGPU::S_ADD_I32) { + const MachineOperand *SccDef = + Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); + if (!SccDef->isDead()) + return false; + NewDefOpc = AMDGPU::S_ADD_U32; + } + NeedInversion = !NeedInversion; + return true; +} + bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, @@ -10565,6 +11046,72 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) return false; + const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, + this](bool NeedInversion) -> bool { + if (CmpValue != 0) + return false; + + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) + return false; + + // For S_OP that set SCC = DST!=0, do the transformation + // + // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...) + // + // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and + // do the transformation: + // + // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...) 
+ // + // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value + // for S_CSELECT* already has the same value that will be calculated by + // s_cmp_lg_* + // + // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* + // (non-zero imm), 0) + + unsigned NewDefOpc = Def->getOpcode(); + if (!setsSCCIfResultIsNonZero(*Def) && + !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) && + !foldableSelect(*Def)) + return false; + + if (!optimizeSCC(Def, &CmpInstr, NeedInversion)) + return false; + + if (NewDefOpc != Def->getOpcode()) + Def->setDesc(get(NewDefOpc)); + + // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) + optimizeSCC(Select, Def, /*NeedInversion=*/false); + } + } + } + return true; + }; + const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, bool IsReversible, bool IsSigned) -> bool { @@ -10591,8 
+11138,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && @@ -10639,17 +11186,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); - I != E; ++I) { - if (I->modifiesRegister(AMDGPU::SCC, &RI) || - I->killsRegister(AMDGPU::SCC, &RI)) - return false; - } - - MachineOperand *SccDef = - Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr); - SccDef->setIsDead(false); - CmpInstr.eraseFromParent(); + if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false)) + return false; if (!MRI->use_nodbg_empty(DefReg)) { assert(!IsReversedCC); @@ -10679,7 +11217,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_EQ_I32: case AMDGPU::S_CMPK_EQ_U32: case AMDGPU::S_CMPK_EQ_I32: - return optimizeCmpAnd(1, 32, true, false); + return optimizeCmpAnd(1, 32, true, false) || + optimizeCmpSelect(/*NeedInversion=*/true); case AMDGPU::S_CMP_GE_U32: case AMDGPU::S_CMPK_GE_U32: return optimizeCmpAnd(1, 32, false, false); @@ -10692,7 +11231,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_LG_I32: case AMDGPU::S_CMPK_LG_U32: case AMDGPU::S_CMPK_LG_I32: - return optimizeCmpAnd(0, 32, true, false); + return optimizeCmpAnd(0, 32, true, false) || + optimizeCmpSelect(/*NeedInversion=*/false); case AMDGPU::S_CMP_GT_U32: case AMDGPU::S_CMPK_GT_U32: return optimizeCmpAnd(0, 32, false, false); @@ -10700,7 +11240,8 @@ bool 
SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMPK_GT_I32: return optimizeCmpAnd(0, 32, false, true); case AMDGPU::S_CMP_LG_U64: - return optimizeCmpAnd(0, 64, true, false); + return optimizeCmpAnd(0, 64, true, false) || + optimizeCmpSelect(/*NeedInversion=*/false); } return false; @@ -10731,7 +11272,7 @@ void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass : &AMDGPU::VReg_64_Align2RegClass); BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR) - .addReg(DataReg, 0, Op.getSubReg()) + .addReg(DataReg, {}, Op.getSubReg()) .addImm(AMDGPU::sub0) .addReg(Undef) .addImm(AMDGPU::sub1); @@ -10751,7 +11292,7 @@ bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { if (!isWMMA(MI) && !isSWMMAC(MI)) return false; - if (AMDGPU::isGFX1250(ST)) + if (ST.hasGFX1250Insts()) return AMDGPU::getWMMAIsXDL(MI.getOpcode()); return true; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e979eeb..0b54513 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -52,6 +52,11 @@ static const MachineMemOperand::Flags MOLastUse = static const MachineMemOperand::Flags MOCooperative = MachineMemOperand::MOTargetFlag3; +/// Mark the MMO of accesses to memory locations that are +/// never written to by other threads. +static const MachineMemOperand::Flags MOThreadPrivate = + MachineMemOperand::MOTargetFlag4; + /// Utility to store machine instructions worklist. 
struct SIInstrWorklist { SIInstrWorklist() = default; @@ -88,6 +93,8 @@ private: }; class SIInstrInfo final : public AMDGPUGenInstrInfo { + struct ThreeAddressUpdates; + private: const SIRegisterInfo RI; const GCNSubtarget &ST; @@ -123,6 +130,11 @@ public: unsigned SubIdx, const TargetRegisterClass *SubRC) const; private: + bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + bool NeedInversion) const; + + bool invertSCCUse(MachineInstr *SCCDef) const; + void swapOperands(MachineInstr &Inst) const; std::pair<bool, MachineBasicBlock *> @@ -134,6 +146,8 @@ private: void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const; void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, @@ -170,7 +184,7 @@ private: void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; @@ -190,6 +204,9 @@ private: bool resultDependsOnExec(const MachineInstr &MI) const; + MachineInstr *convertToThreeAddressImpl(MachineInstr &MI, + ThreeAddressUpdates &Updates) const; + protected: /// If the specific machine instruction is a instruction that moves/copies /// value from one register to another register return destination and source @@ -291,6 +308,8 @@ public: bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const; + unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, @@ -302,22 +321,20 @@ public: void storeRegToStackSlot( MachineBasicBlock &MBB, 
MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, + unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. @@ -421,6 +438,9 @@ public: void removeModOperands(MachineInstr &MI) const; + void mutateAndCleanupImplicit(MachineInstr &MI, + const MCInstrDesc &NewDesc) const; + /// Return the extracted immediate value in a subregister use from a constant /// materialized in a super register. 
/// @@ -446,7 +466,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SALU; } - bool isSALU(uint16_t Opcode) const { + bool isSALU(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } @@ -454,7 +474,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VALU; } - bool isVALU(uint16_t Opcode) const { + bool isVALU(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } @@ -462,7 +482,7 @@ public: return isMIMG(MI) || isVSAMPLE(MI) || isVIMAGE(MI); } - bool isImage(uint16_t Opcode) const { + bool isImage(uint32_t Opcode) const { return isMIMG(Opcode) || isVSAMPLE(Opcode) || isVIMAGE(Opcode); } @@ -470,7 +490,7 @@ public: return isMUBUF(MI) || isMTBUF(MI) || isImage(MI) || isFLAT(MI); } - bool isVMEM(uint16_t Opcode) const { + bool isVMEM(uint32_t Opcode) const { return isMUBUF(Opcode) || isMTBUF(Opcode) || isImage(Opcode); } @@ -478,7 +498,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOP1; } - bool isSOP1(uint16_t Opcode) const { + bool isSOP1(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } @@ -486,7 +506,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOP2; } - bool isSOP2(uint16_t Opcode) const { + bool isSOP2(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } @@ -494,7 +514,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOPC; } - bool isSOPC(uint16_t Opcode) const { + bool isSOPC(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } @@ -502,7 +522,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOPK; } - bool isSOPK(uint16_t Opcode) const { + bool isSOPK(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } @@ -510,7 +530,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SOPP; } - bool isSOPP(uint16_t Opcode) const { + bool isSOPP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } @@ -518,7 +538,7 @@ public: return MI.getDesc().TSFlags & 
SIInstrFlags::IsPacked; } - bool isPacked(uint16_t Opcode) const { + bool isPacked(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsPacked; } @@ -526,7 +546,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOP1; } - bool isVOP1(uint16_t Opcode) const { + bool isVOP1(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } @@ -534,7 +554,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOP2; } - bool isVOP2(uint16_t Opcode) const { + bool isVOP2(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } @@ -544,13 +564,13 @@ public: static bool isVOP3(const MachineInstr &MI) { return isVOP3(MI.getDesc()); } - bool isVOP3(uint16_t Opcode) const { return isVOP3(get(Opcode)); } + bool isVOP3(uint32_t Opcode) const { return isVOP3(get(Opcode)); } static bool isSDWA(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SDWA; } - bool isSDWA(uint16_t Opcode) const { + bool isSDWA(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SDWA; } @@ -558,7 +578,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOPC; } - bool isVOPC(uint16_t Opcode) const { + bool isVOPC(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } @@ -566,7 +586,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; } - bool isMUBUF(uint16_t Opcode) const { + bool isMUBUF(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MUBUF; } @@ -574,15 +594,19 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; } - bool isMTBUF(uint16_t Opcode) const { + bool isMTBUF(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isBUF(const MachineInstr &MI) { + return isMUBUF(MI) || isMTBUF(MI); + } + static bool isSMRD(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SMRD; } - bool isSMRD(uint16_t Opcode) const { + bool isSMRD(uint32_t Opcode) const { return get(Opcode).TSFlags & 
SIInstrFlags::SMRD; } @@ -592,33 +616,35 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::DS; } - bool isDS(uint16_t Opcode) const { + bool isDS(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } static bool isLDSDMA(const MachineInstr &MI) { - return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI)); + return (isVALU(MI) && (isMUBUF(MI) || isFLAT(MI))) || + (MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT); } - bool isLDSDMA(uint16_t Opcode) { - return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode)); + bool isLDSDMA(uint32_t Opcode) { + return (isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode))) || + (get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT); } static bool isGWS(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::GWS; } - bool isGWS(uint16_t Opcode) const { + bool isGWS(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::GWS; } - bool isAlwaysGDS(uint16_t Opcode) const; + bool isAlwaysGDS(uint32_t Opcode) const; static bool isMIMG(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::MIMG; } - bool isMIMG(uint16_t Opcode) const { + bool isMIMG(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } @@ -626,7 +652,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VIMAGE; } - bool isVIMAGE(uint16_t Opcode) const { + bool isVIMAGE(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VIMAGE; } @@ -634,7 +660,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VSAMPLE; } - bool isVSAMPLE(uint16_t Opcode) const { + bool isVSAMPLE(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VSAMPLE; } @@ -642,7 +668,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::Gather4; } - bool isGather4(uint16_t Opcode) const { + bool isGather4(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::Gather4; } @@ -657,7 +683,7 @@ public: return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); } - bool 
isSegmentSpecificFLAT(uint16_t Opcode) const { + bool isSegmentSpecificFLAT(uint32_t Opcode) const { auto Flags = get(Opcode).TSFlags; return Flags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); } @@ -666,7 +692,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FlatGlobal; } - bool isFLATGlobal(uint16_t Opcode) const { + bool isFLATGlobal(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FlatGlobal; } @@ -674,20 +700,20 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FlatScratch; } - bool isFLATScratch(uint16_t Opcode) const { + bool isFLATScratch(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FlatScratch; } // Any FLAT encoded instruction, including global_* and scratch_*. - bool isFLAT(uint16_t Opcode) const { + bool isFLAT(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } - /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with - /// SCRATCH_ memory operands. + /// \returns true for SCRATCH_ instructions, or FLAT/BUF instructions unless + /// the MMOs do not include scratch. /// Conservatively correct; will return true if \p MI cannot be proven /// to not hit scratch. - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + bool mayAccessScratch(const MachineInstr &MI) const; /// \returns true for FLAT instructions that can access VMEM. bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; @@ -695,7 +721,7 @@ public: /// \returns true for FLAT instructions that can access LDS. 
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - static bool isBlockLoadStore(uint16_t Opcode) { + static bool isBlockLoadStore(uint32_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: @@ -709,6 +735,52 @@ public: } } + static bool setsSCCIfResultIsNonZero(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_ABSDIFF_I32: + case AMDGPU::S_ABS_I32: + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + case AMDGPU::S_ANDN2_B32: + case AMDGPU::S_ANDN2_B64: + case AMDGPU::S_ASHR_I32: + case AMDGPU::S_ASHR_I64: + case AMDGPU::S_BCNT0_I32_B32: + case AMDGPU::S_BCNT0_I32_B64: + case AMDGPU::S_BCNT1_I32_B32: + case AMDGPU::S_BCNT1_I32_B64: + case AMDGPU::S_BFE_I32: + case AMDGPU::S_BFE_I64: + case AMDGPU::S_BFE_U32: + case AMDGPU::S_BFE_U64: + case AMDGPU::S_LSHL_B32: + case AMDGPU::S_LSHL_B64: + case AMDGPU::S_LSHR_B32: + case AMDGPU::S_LSHR_B64: + case AMDGPU::S_NAND_B32: + case AMDGPU::S_NAND_B64: + case AMDGPU::S_NOR_B32: + case AMDGPU::S_NOR_B64: + case AMDGPU::S_NOT_B32: + case AMDGPU::S_NOT_B64: + case AMDGPU::S_OR_B32: + case AMDGPU::S_OR_B64: + case AMDGPU::S_ORN2_B32: + case AMDGPU::S_ORN2_B64: + case AMDGPU::S_QUADMASK_B32: + case AMDGPU::S_QUADMASK_B64: + case AMDGPU::S_WQM_B32: + case AMDGPU::S_WQM_B64: + case AMDGPU::S_XNOR_B32: + case AMDGPU::S_XNOR_B64: + case AMDGPU::S_XOR_B32: + case AMDGPU::S_XOR_B64: + return true; + default: + return false; + } + } + static bool isEXP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::EXP; } @@ -721,7 +793,7 @@ public: Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1; } - bool isEXP(uint16_t Opcode) const { + bool isEXP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::EXP; } @@ -729,7 +801,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicNoRet; } - bool isAtomicNoRet(uint16_t Opcode) const { + bool isAtomicNoRet(uint32_t Opcode) const { return get(Opcode).TSFlags & 
SIInstrFlags::IsAtomicNoRet; } @@ -737,7 +809,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::IsAtomicRet; } - bool isAtomicRet(uint16_t Opcode) const { + bool isAtomicRet(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsAtomicRet; } @@ -746,13 +818,17 @@ public: SIInstrFlags::IsAtomicNoRet); } - bool isAtomic(uint16_t Opcode) const { + bool isAtomic(uint32_t Opcode) const { return get(Opcode).TSFlags & (SIInstrFlags::IsAtomicRet | SIInstrFlags::IsAtomicNoRet); } static bool mayWriteLDSThroughDMA(const MachineInstr &MI) { - return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; + unsigned Opc = MI.getOpcode(); + // Exclude instructions that read FROM LDS (not write to it) + return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD && + Opc != AMDGPU::TENSOR_STORE_FROM_LDS && + Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2; } static bool isSBarrierSCCWrite(unsigned Opcode) { @@ -771,7 +847,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::WQM; } - bool isWQM(uint16_t Opcode) const { + bool isWQM(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } @@ -779,7 +855,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM; } - bool isDisableWQM(uint16_t Opcode) const { + bool isDisableWQM(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DisableWQM; } @@ -794,7 +870,7 @@ public: (isSpill(MI) && isVALU(MI)); } - bool isVGPRSpill(uint16_t Opcode) const { + bool isVGPRSpill(uint32_t Opcode) const { return Opcode != AMDGPU::SI_SPILL_S32_TO_VGPR && Opcode != AMDGPU::SI_RESTORE_S32_FROM_VGPR && (isSpill(Opcode) && isVALU(Opcode)); @@ -806,13 +882,13 @@ public: (isSpill(MI) && isSALU(MI)); } - bool isSGPRSpill(uint16_t Opcode) const { + bool isSGPRSpill(uint32_t Opcode) const { return Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR || Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR || (isSpill(Opcode) && isSALU(Opcode)); } - bool isSpill(uint16_t Opcode) const { + bool isSpill(uint32_t 
Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::Spill; } @@ -822,7 +898,7 @@ public: static bool isSpill(const MachineInstr &MI) { return isSpill(MI.getDesc()); } - static bool isWWMRegSpillOpcode(uint16_t Opcode) { + static bool isWWMRegSpillOpcode(uint32_t Opcode) { return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE || Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE || @@ -838,7 +914,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::DPP; } - bool isDPP(uint16_t Opcode) const { + bool isDPP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DPP; } @@ -846,7 +922,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::TRANS; } - bool isTRANS(uint16_t Opcode) const { + bool isTRANS(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::TRANS; } @@ -854,7 +930,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VOP3P; } - bool isVOP3P(uint16_t Opcode) const { + bool isVOP3P(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP3P; } @@ -862,7 +938,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VINTRP; } - bool isVINTRP(uint16_t Opcode) const { + bool isVINTRP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VINTRP; } @@ -872,13 +948,18 @@ public: static bool isMAI(const MachineInstr &MI) { return isMAI(MI.getDesc()); } - bool isMAI(uint16_t Opcode) const { return isMAI(get(Opcode)); } + bool isMAI(uint32_t Opcode) const { return isMAI(get(Opcode)); } static bool isMFMA(const MachineInstr &MI) { return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; } + bool isMFMA(uint32_t Opcode) const { + return isMAI(Opcode) && Opcode != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + Opcode != AMDGPU::V_ACCVGPR_READ_B32_e64; + } + static bool isDOT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } @@ -887,7 +968,7 @@ public: return MI.getDesc().TSFlags & 
SIInstrFlags::IsWMMA; } - bool isWMMA(uint16_t Opcode) const { + bool isWMMA(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsWMMA; } @@ -895,15 +976,19 @@ public: return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI); } + bool isMFMAorWMMA(uint32_t Opcode) const { + return isMFMA(Opcode) || isWMMA(Opcode) || isSWMMAC(Opcode); + } + static bool isSWMMAC(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC; } - bool isSWMMAC(uint16_t Opcode) const { + bool isSWMMAC(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsSWMMAC; } - bool isDOT(uint16_t Opcode) const { + bool isDOT(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } @@ -917,7 +1002,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR; } - bool isLDSDIR(uint16_t Opcode) const { + bool isLDSDIR(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::LDSDIR; } @@ -925,7 +1010,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VINTERP; } - bool isVINTERP(uint16_t Opcode) const { + bool isVINTERP(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VINTERP; } @@ -941,6 +1026,14 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT; } + static bool usesASYNC_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::ASYNC_CNT; + } + + bool usesASYNC_CNT(uint32_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT; + } + // Most sopk treat the immediate as a signed 16-bit, however some // use it as unsigned. 
static bool sopkIsZext(unsigned Opcode) { @@ -957,7 +1050,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE; } - bool isScalarStore(uint16_t Opcode) const { + bool isScalarStore(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE; } @@ -965,7 +1058,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE; } - bool isFixedSize(uint16_t Opcode) const { + bool isFixedSize(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE; } @@ -973,7 +1066,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FPClamp; } - bool hasFPClamp(uint16_t Opcode) const { + bool hasFPClamp(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FPClamp; } @@ -993,7 +1086,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding; } - bool usesFPDPRounding(uint16_t Opcode) const { + bool usesFPDPRounding(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding; } @@ -1001,7 +1094,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FPAtomic; } - bool isFPAtomic(uint16_t Opcode) const { + bool isFPAtomic(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FPAtomic; } @@ -1046,7 +1139,7 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead; } - bool doesNotReadTiedSource(uint16_t Opcode) const { + bool doesNotReadTiedSource(uint32_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead; } @@ -1114,13 +1207,13 @@ public: bool isVGPRCopy(const MachineInstr &MI) const { assert(isCopyInstr(MI)); Register Dest = MI.getOperand(0).getReg(); - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return !RI.isSGPRReg(MRI, Dest); } bool hasVGPRUses(const MachineInstr &MI) const { - const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = 
MF.getRegInfo(); return llvm::any_of(MI.explicit_uses(), [&MRI, this](const MachineOperand &MO) { @@ -1289,7 +1382,7 @@ public: /// Return the size in bytes of the operand OpNo on the given // instruction opcode. - unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const { const MCOperandInfo &OpInfo = get(Opcode).operands()[OpNo]; if (OpInfo.RegClass == -1) { @@ -1501,6 +1594,8 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg = Register()) const override; + bool canAddToBBProlog(const MachineInstr &MI) const; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, @@ -1562,10 +1657,6 @@ public: /// Return true if this opcode should not be used by codegen. bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass * - getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; - void fixImplicitOperands(MachineInstr &MI) const; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, @@ -1579,22 +1670,21 @@ public: const MachineInstr &MI, unsigned *PredCost = nullptr) const override; + const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override; + InstructionUniformity - getInstructionUniformity(const MachineInstr &MI) const override final; + getInstructionUniformity(const MachineInstr &MI) const final; InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const; - const MIRFormatter *getMIRFormatter() const override { - if (!Formatter) - Formatter = std::make_unique<AMDGPUMIRFormatter>(); - return Formatter.get(); - } + const MIRFormatter *getMIRFormatter() const override; static unsigned getDSShaderTypeValue(const MachineFunction &MF); const TargetSchedModel &getSchedModel() const { return SchedModel; } + // FIXME: This should be removed // Enforce operand's \p OpName even alignment if required 
by target. // This is used if an operand is a 32 bit register but needs to be aligned // regardless. @@ -1627,7 +1717,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. /// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not @@ -1647,86 +1737,86 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, namespace AMDGPU { LLVM_READONLY - int getVOPe64(uint16_t Opcode); + int64_t getVOPe64(uint32_t Opcode); LLVM_READONLY - int getVOPe32(uint16_t Opcode); + int64_t getVOPe32(uint32_t Opcode); LLVM_READONLY - int getSDWAOp(uint16_t Opcode); + int64_t getSDWAOp(uint32_t Opcode); LLVM_READONLY - int getDPPOp32(uint16_t Opcode); + int64_t getDPPOp32(uint32_t Opcode); LLVM_READONLY - int getDPPOp64(uint16_t Opcode); + int64_t getDPPOp64(uint32_t Opcode); LLVM_READONLY - int getBasicFromSDWAOp(uint16_t Opcode); + int64_t getBasicFromSDWAOp(uint32_t Opcode); LLVM_READONLY - int getCommuteRev(uint16_t Opcode); + int64_t getCommuteRev(uint32_t Opcode); LLVM_READONLY - int getCommuteOrig(uint16_t Opcode); + int64_t getCommuteOrig(uint32_t Opcode); LLVM_READONLY - int getAddr64Inst(uint16_t Opcode); + int64_t getAddr64Inst(uint32_t Opcode); /// Check if \p Opcode is an Addr64 opcode. /// /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1. LLVM_READONLY - int getIfAddr64Inst(uint16_t Opcode); + int64_t getIfAddr64Inst(uint32_t Opcode); LLVM_READONLY - int getSOPKOp(uint16_t Opcode); + int64_t getSOPKOp(uint32_t Opcode); /// \returns SADDR form of a FLAT Global instruction given an \p Opcode /// of a VADDR form. 
LLVM_READONLY - int getGlobalSaddrOp(uint16_t Opcode); + int64_t getGlobalSaddrOp(uint32_t Opcode); /// \returns VADDR form of a FLAT Global instruction given an \p Opcode /// of a SADDR form. LLVM_READONLY - int getGlobalVaddrOp(uint16_t Opcode); + int64_t getGlobalVaddrOp(uint32_t Opcode); LLVM_READONLY - int getVCMPXNoSDstOp(uint16_t Opcode); + int64_t getVCMPXNoSDstOp(uint32_t Opcode); /// \returns ST form with only immediate offset of a FLAT Scratch instruction /// given an \p Opcode of an SS (SADDR) form. LLVM_READONLY - int getFlatScratchInstSTfromSS(uint16_t Opcode); + int64_t getFlatScratchInstSTfromSS(uint32_t Opcode); /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SVS (SADDR + VADDR) form. LLVM_READONLY - int getFlatScratchInstSVfromSVS(uint16_t Opcode); + int64_t getFlatScratchInstSVfromSVS(uint32_t Opcode); /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SV (VADDR) form. LLVM_READONLY - int getFlatScratchInstSSfromSV(uint16_t Opcode); + int64_t getFlatScratchInstSSfromSV(uint32_t Opcode); /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SS (SADDR) form. LLVM_READONLY - int getFlatScratchInstSVfromSS(uint16_t Opcode); + int64_t getFlatScratchInstSVfromSS(uint32_t Opcode); /// \returns earlyclobber version of a MAC MFMA is exists. LLVM_READONLY - int getMFMAEarlyClobberOp(uint16_t Opcode); + int64_t getMFMAEarlyClobberOp(uint32_t Opcode); /// \returns Version of an MFMA instruction which uses AGPRs for srcC and /// vdst, given an \p Opcode of an MFMA which uses VGPRs for srcC/vdst. LLVM_READONLY - int getMFMASrcCVDstAGPROp(uint16_t Opcode); + int64_t getMFMASrcCVDstAGPROp(uint32_t Opcode); /// \returns v_cmpx version of a v_cmp instruction. 
LLVM_READONLY - int getVCMPXOpFromVCMP(uint16_t Opcode); + int64_t getVCMPXOpFromVCMP(uint32_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63ec..f063b4e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -6,13 +6,6 @@ // //===----------------------------------------------------------------------===// -def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(any_of FeatureWavefrontSize32, - FeatureAssemblerPermissiveWavesize)>; -def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(any_of FeatureWavefrontSize64, - FeatureAssemblerPermissiveWavesize)>; - class AMDGPUMnemonicAlias<string From, string To, string VariantName = ""> : MnemonicAlias<From, To, VariantName>, PredicateControl; @@ -34,6 +27,7 @@ def SIEncodingFamily { int GFX11 = 10; int GFX12 = 11; int GFX1250 = 12; + int GFX13 = 13; } //===----------------------------------------------------------------------===// @@ -47,6 +41,7 @@ class GFXGen<Predicate pred, string dn, string suffix, int sub> { int Subtarget = sub; } +def GFX13Gen : GFXGen<isGFX13Only, "GFX13", "_gfx13", SIEncodingFamily.GFX13>; def GFX1250Gen : GFXGen<isGFX125xOnly, "GFX1250", "_gfx1250", SIEncodingFamily.GFX1250>; def GFX12Not12_50Gen : GFXGen<isGFX12Not12_50, "GFX12", "_gfx12", SIEncodingFamily.GFX12>; def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>; @@ -57,6 +52,8 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.GFX10>; // SI DAG Nodes //===----------------------------------------------------------------------===// +// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output +// modifier behavior with dx10_enable. 
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SDTSBufferLoad : SDTypeProfile<1, 3, @@ -331,7 +328,7 @@ def mfma_f32_32x32x64_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale //===----------------------------------------------------------------------===// class isIntType<ValueType SrcVT> { - bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value)); + bit ret = !and(SrcVT.isInteger, !ne(SrcVT, i1)); } def SDTSBufferPrefetch : SDTypeProfile<0, 3, @@ -776,11 +773,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, @@ -818,6 +811,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">; defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">; +defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">; +defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; @@ -963,6 +958,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{ return isInlineImmediate(Imm); }]>; +def fpimm_pos_zero : FPImmLeaf<fAny, [{ + return Imm.isZero() && !Imm.isNegative(); +}]>; class VGPRImm <dag frag> : PatLeaf<frag, [{ return isVGPRImm(N); @@ -991,6 +989,11 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; + +class build_vector_fpimm_pos_zero_v2<VTVec vec> : PatLeaf< + (vec (build_vector (vec.ElementType fpimm_pos_zero), + (vec.ElementType fpimm_pos_zero)))>; + def 
MFMALdScaleXForm : SDNodeXForm<timm, [{ unsigned Val = N->getZExtValue(); unsigned New = 0; @@ -1001,11 +1004,13 @@ def MFMALdScaleXForm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32); }]>; -def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{ +def fcanonicalize_canonicalized + : PatFrag<(ops node:$op), (fcanonicalize node:$op), [{ const SITargetLowering &Lowering = *static_cast<const SITargetLowering *>(getTargetLowering()); - return Lowering.isCanonicalized(*CurDAG, Op); + return Lowering.isCanonicalized(*CurDAG, Op->getOperand(0), N->getFlags()); }]> { + // FIXME: This predicate for GlobalISel is dead code. let GISelPredicateCode = [{ const SITargetLowering *TLI = static_cast<const SITargetLowering *>( MF.getSubtarget().getTargetLowering()); @@ -1084,6 +1089,8 @@ def VReg32OrOffClass : AsmOperandClass { def SendMsg : CustomOperand<i32>; +def WaitEvent : CustomOperand<i16>; + def Swizzle : CustomOperand<i16, 1>; def Endpgm : CustomOperand<i16, 1>; @@ -1197,12 +1204,12 @@ class NamedIntOperand<string prefix, bit Optional = 1, string name = NAME> !if(AlwaysPrint, "true", "false")#"); }"; } -class NamedBitOperand<string Id, string Name = NAME> +class NamedBitOperand<string Id, string Name = NAME, bit AlwaysIgnoreNegative = 0> : CustomOperand<i1, 1, Name> { let PredicateMethod = "isImmTy<AMDGPUOperand::"#ImmTy#">"; let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# - "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }"; + "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy# !if(AlwaysIgnoreNegative, ", true", ", false")#"); }"; let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "# "const MCSubtargetInfo &STI, raw_ostream &O) { "# "printNamedBit(MI, OpNo, O, \""#Id#"\"); }"; @@ -1260,6 +1267,8 @@ def CPol_NonGLC : ValuePredicatedOperand<CPol, "!(Op.getImm() & CPol::GLC)", 1>; def CPol_GLC_WithDefault : DefaultOperand<CPol_GLC, !shl(1, CPolBit.GLC)>; def 
CPol_NonGLC_WithDefault : DefaultOperand<CPol_NonGLC, 0>; +def IsAsync : NamedBitOperand<"isasync">; + def TFE : NamedBitOperand<"tfe">; def UNorm : NamedBitOperand<"unorm">; def DA : NamedBitOperand<"da">; @@ -1267,8 +1276,10 @@ def R128A16 : CustomOperand<i1, 1>; def A16 : NamedBitOperand<"a16">; def D16 : NamedBitOperand<"d16">; def LWE : NamedBitOperand<"lwe">; -def exp_compr : NamedBitOperand<"compr", "ExpCompr">; -def exp_vm : NamedBitOperand<"vm", "ExpVM">; +def exp_compr : NamedBitOperand<"compr", "ExpCompr", 1>; +def exp_vm : NamedBitOperand<"vm", "ExpVM", 1>; +def exp_done : NamedBitOperand<"done", "Done", 1>; +def exp_row_en : NamedBitOperand<"row_en", "RowEn", 1>; def FORMAT : CustomOperand<i8>; @@ -1796,10 +1807,10 @@ class SIMCInstr <string pseudo, int subtarget> { class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src0.Value, untyped.Value), 0, - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 - !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3))); // VOP3 + !if (!eq(Src0, untyped), 0, + !if (!eq(Src1, untyped), 1, // VOP1 + !if (!eq(Src2, untyped), 2, // VOP2 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -1868,17 +1879,17 @@ class getVCSrcForVT<ValueType VT> { !if(VT.isFP, !if(!eq(VT.Size, 64), VCSrc_f64, - !cond(!eq(VT.Value, f16.Value) : VCSrc_f16, - !eq(VT.Value, bf16.Value) : VCSrc_bf16, - !eq(VT.Value, v2f16.Value) : VCSrc_v2f16, - !eq(VT.Value, v2bf16.Value) : VCSrc_v2bf16, + !cond(!eq(VT, f16) : VCSrc_f16, + !eq(VT, bf16) : VCSrc_bf16, + !eq(VT, v2f16) : VCSrc_v2f16, + !eq(VT, v2bf16) : VCSrc_v2bf16, 1 : VCSrc_f32) ), !if(!eq(VT.Size, 64), VCSrc_b64, - !if(!eq(VT.Value, i16.Value), + !if(!eq(VT, i16), VCSrc_b16, - !if(!eq(VT.Value, v2i16.Value), + !if(!eq(VT, v2i16), VCSrc_v2b16, VCSrc_b32 ) @@ -2003,28 +2014,28 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { // Float or packed int class isModifierType<ValueType SrcVT> { - bit ret = 
!or(!eq(SrcVT.Value, f16.Value), - !eq(SrcVT.Value, bf16.Value), - !eq(SrcVT.Value, f32.Value), - !eq(SrcVT.Value, f64.Value), - !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v2i16.Value), - !eq(SrcVT.Value, v2bf16.Value), - !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2i32.Value), - !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v4i16.Value), - !eq(SrcVT.Value, v4bf16.Value), - !eq(SrcVT.Value, v4f32.Value), - !eq(SrcVT.Value, v4i32.Value), - !eq(SrcVT.Value, v8f16.Value), - !eq(SrcVT.Value, v8i16.Value), - !eq(SrcVT.Value, v8bf16.Value), - !eq(SrcVT.Value, v8f32.Value), - !eq(SrcVT.Value, v8i32.Value), - !eq(SrcVT.Value, v16f16.Value), - !eq(SrcVT.Value, v16i16.Value), - !eq(SrcVT.Value, v16bf16.Value)); + bit ret = !or(!eq(SrcVT, f16), + !eq(SrcVT, bf16), + !eq(SrcVT, f32), + !eq(SrcVT, f64), + !eq(SrcVT, v2f16), + !eq(SrcVT, v2i16), + !eq(SrcVT, v2bf16), + !eq(SrcVT, v2f32), + !eq(SrcVT, v2i32), + !eq(SrcVT, v4f16), + !eq(SrcVT, v4i16), + !eq(SrcVT, v4bf16), + !eq(SrcVT, v4f32), + !eq(SrcVT, v4i32), + !eq(SrcVT, v8f16), + !eq(SrcVT, v8i16), + !eq(SrcVT, v8bf16), + !eq(SrcVT, v8f32), + !eq(SrcVT, v8i32), + !eq(SrcVT, v16f16), + !eq(SrcVT, v16i16), + !eq(SrcVT, v16bf16)); } // Return type of input modifiers operand for specified input operand. 
@@ -2057,9 +2068,9 @@ class getSrcModDPP <ValueType VT> { class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { Operand ret = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16_Lo128VRegInputMods<IsFake16>, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16_Lo128VRegInputMods<IsFake16>, IntVRegInputMods)); } @@ -2068,11 +2079,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> { Operand ret = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VCSrcInputMods<IsFake16>, - !if (!eq(VT.Value, f64.Value), FP64VCSrcInputMods, + !if (!eq(VT, f64), FP64VCSrcInputMods, FP32VCSrcInputMods)), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16VCSrcInputMods<IsFake16>, Int32VCSrcInputMods)); } @@ -2084,15 +2095,15 @@ class getSrcModVOP3VC <ValueType VT, bit IsFake16 = 1> { class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> { defvar T16Dst = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VRegInputMods<IsFake16>, FPVRegT16DstInputMods), - !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods<IsFake16>, + !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>, IntVRegT16DstInputMods)); defvar Normal = !if (VT.isFP, - !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + !if (!or(!eq(VT, f16), !eq(VT, bf16)), FPT16VRegInputMods<IsFake16>, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), + !if (!eq(VT, i16), IntT16VRegInputMods<IsFake16>, IntVRegInputMods)); Operand ret = !if(!and(!not(IsFake16), !eq(DstVT.Size, 16)), T16Dst, Normal); @@ -2102,16 +2113,16 @@ class getSrc0ModVOP3DPP <ValueType VT, ValueType DstVT, bit IsFake16 = 1> { // only operands (VOPD3 vsrc1 and vsrc2). 
class getSrcModVOP3V <ValueType VT> { Operand ret = - !if (!eq(VT.Value, f64.Value), FP64VRegSrcInputMods, + !if (!eq(VT, f64), FP64VRegSrcInputMods, FP32VRegSrcInputMods); } // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA <ValueType VT> { - Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, - !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods, - !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods, - !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods, + Operand ret = !if(!eq(VT, f16), FP16SDWAInputMods, + !if(!eq(VT, f32), FP32SDWAInputMods, + !if(!eq(VT, i16), Int16SDWAInputMods, + !if(!eq(VT, bf16), FP16SDWAInputMods, Int32SDWAInputMods)))); } @@ -2778,14 +2789,14 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel); field bit HasBitOp3 = 0; - field bit HasDst = !ne(DstVT.Value, untyped.Value); + field bit HasDst = !ne(DstVT, untyped); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case field bit EmitDstSel = EmitDst; field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; - field bit HasSrc0 = !ne(Src0VT.Value, untyped.Value); - field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value); - field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value); + field bit HasSrc0 = !ne(Src0VT, untyped); + field bit HasSrc1 = !ne(Src1VT, untyped); + field bit HasSrc2 = !ne(Src2VT, untyped); field bit HasSrc0FloatMods = Src0VT.isFP; field bit HasSrc1FloatMods = Src1VT.isFP; @@ -3364,7 +3375,8 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.GFX940)], [!cast<string>(SIEncodingFamily.GFX11)], [!cast<string>(SIEncodingFamily.GFX12)], - [!cast<string>(SIEncodingFamily.GFX1250)]]; + [!cast<string>(SIEncodingFamily.GFX1250)], + [!cast<string>(SIEncodingFamily.GFX13)]]; } // Get equivalent SOPK instruction. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 27e5ee9c..cde3523 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the // SIFoldOperands pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst), +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)> { let isReMaterializable = 1; let isAsCheapAsAMove = 1; @@ -328,7 +328,7 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), multiclass AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> { - let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { + let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in { def !toupper(Op) #"_PSEUDO_" #DataType : VPseudoInstSI<(outs RetReg : $sdst), (ins Reg : $src, VSrc_b32 : $strategy), @@ -348,7 +348,7 @@ class WaveReduceOp<string OpName, string TypeStr, ValueType Ty, // Input list : [Operation_name, // type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B), -// bit-width +// input-type // output register class, // input register class] defvar Operations = [ @@ -371,6 +371,15 @@ defvar Operations = [ WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>, WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>, WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>, + + WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fmin", "F64", f64, SGPR_64, VSrc_b64>, + WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fmax", "F64", f64, SGPR_64, VSrc_b64>, + WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fadd", "F64", f64, SGPR_64, 
VSrc_b64>, + WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fsub", "F64", f64, SGPR_64, VSrc_b64>, ]; foreach Op = Operations in { @@ -791,6 +800,17 @@ def : GCNPat< (SI_CALL_ISEL $src0, (i64 0)) >; +// Funnel shift right (fshr) patterns for uniform inputs. +// These patterns implement this using scalar instructions by constructing a 64-bit +// value {a, b} and performing a single right shift. +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0)) +>; + +def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)), + (i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc. def SI_CALL : SPseudoInstSI < @@ -804,9 +824,8 @@ def SI_CALL : SPseudoInstSI < let isConvergent = 1; } -class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), - (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), - [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { +class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []> + : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> { let Size = 4; let FixedSize = 1; let isCall = 1; @@ -820,8 +839,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), } // Tail call handling pseudo -def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>; -def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>; +def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, + [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>; +def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, + [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>; + +// Tail call for chain calling conventions. 
+// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls +// never return and don't need to preserve any SGPRs. +def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>; // Handle selecting indirect tail calls def : GCNPat< @@ -851,13 +877,13 @@ multiclass SI_CS_CHAIN_TC< // This is essentially a tail call, but it also takes a mask to put in EXEC // right before jumping to the callee. def NAME: SPseudoInstSI <(outs), - (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>; + (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>; // Same as above, but it will first try to reallocate the VGPRs, and choose an // EXEC mask and a callee depending on the success of the reallocation attempt. def _DVGPR : SPseudoInstSI <(outs), - (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec, - SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>; + (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec, + SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>; } // End FixedSize = 0 etc } @@ -869,7 +895,7 @@ multiclass si_cs_chain_tc_pattern< dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> { def : GCNPat< (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec), - (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) + (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) >; } @@ -896,14 +922,15 @@ multiclass si_cs_chain_tc_dvgpr_patterns< (AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec, i32:$numvgprs, execvt:$fbexec, i64:$fbcallee), - (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec, - SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee) + (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec, + SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee) >; } } defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only. 
+let Defs = [SCC] in { def ADJCALLSTACKUP : SPseudoInstSI< (outs), (ins i32imm:$amt0, i32imm:$amt1), [(callseq_start timm:$amt0, timm:$amt1)], @@ -913,7 +940,6 @@ def ADJCALLSTACKUP : SPseudoInstSI< let hasSideEffects = 1; let usesCustomInserter = 1; let SchedRW = [WriteSALU]; - let Defs = [SCC]; } def ADJCALLSTACKDOWN : SPseudoInstSI< @@ -924,9 +950,16 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let hasSideEffects = 1; let usesCustomInserter = 1; let SchedRW = [WriteSALU]; - let Defs = [SCC]; } +// Get the offset of the base of the stack, skipping any reserved areas. +def GET_STACK_BASE : SPseudoInstSI<(outs SGPR_32:$dst), (ins), + [(set p5:$dst, (sponentry))]> { + let Size = 16; // Worst case (s_getreg, s_cmp, s_cselect + constant). + let SchedRW = [WriteSALU]; +} +} // End Defs = [SCC] + let Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1 in { @@ -947,7 +980,11 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V3 : SI_INDIRECT_SRC<VReg_96>; def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V5 : SI_INDIRECT_SRC<VReg_160>; +def SI_INDIRECT_SRC_V6 : SI_INDIRECT_SRC<VReg_192>; +def SI_INDIRECT_SRC_V7 : SI_INDIRECT_SRC<VReg_224>; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>; def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>; @@ -958,7 +995,11 @@ def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; +def SI_INDIRECT_DST_V3 : SI_INDIRECT_DST<VReg_96>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; +def SI_INDIRECT_DST_V5 : SI_INDIRECT_DST<VReg_160>; +def SI_INDIRECT_DST_V6 : SI_INDIRECT_DST<VReg_192>; +def SI_INDIRECT_DST_V7 : SI_INDIRECT_DST<VReg_224>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; def SI_INDIRECT_DST_V9 : 
SI_INDIRECT_DST<VReg_288>; def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>; @@ -1004,6 +1045,8 @@ def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo< def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V6 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_192>; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V7 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_224>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>; def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>; @@ -1017,6 +1060,8 @@ def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo< def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V6 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_192>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V7 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_224>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>; @@ -1049,6 +1094,8 @@ def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VR def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : 
V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_192>; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_224>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>; @@ -1069,6 +1116,8 @@ def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V6 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_192>; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V7 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_224>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>; def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>; @@ -1319,22 +1368,22 @@ multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_ >; def : GCNPat < - (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), + (f32 (any_f16_to_fp (and_oneuse i32:$src0, 0x7fff))), (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0) >; def : GCNPat < - (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), + (f32 (any_f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0))) >; def : GCNPat < - (f32 (f16_to_fp (or_oneuse i32:$src0, 
0x8000))), + (f32 (any_f16_to_fp (or_oneuse i32:$src0, 0x8000))), (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0) >; def : GCNPat < - (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), + (f32 (any_f16_to_fp (xor_oneuse i32:$src0, 0x8000))), (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0) >; @@ -1429,7 +1478,7 @@ def : GCNPat < // Don't allow source modifiers. If there are any source modifiers then it's // better to select fma instead of fmac. -let SubtargetPredicate = HasFmaLegacy32 in +let SubtargetPredicate = HasFmacLegacy32 in def : GCNPat < (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0), (VOP3NoMods f32:$src1), @@ -2223,8 +2272,8 @@ def : GCNPat < def : GCNPat < (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), - (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, - 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0), + (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src, + !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0), 0, 0, 0, 0, 0) > { let SubtargetPredicate = HasPackedFP32Ops; @@ -2238,12 +2287,34 @@ def : GCNPat < >; def : GCNPat < + (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), fp16vt:$src1), + (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), SReg_32:$src1) +>; + +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src1) +>; + +def : GCNPat < (fcopysign f32:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (V_LSHLREV_B32_e64 (i32 16), $src1)) >; def : GCNPat < + (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), fp16vt:$src1), + (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), + (S_LSHL_B32 SReg_32:$src1, (i32 16))) +>; + +def : GCNPat < + (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), + (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) +>; + +def : GCNPat < (fcopysign f64:$src0, fp16vt:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -2258,6 +2329,18 @@ def : 
GCNPat < >; def : GCNPat < + (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), f32:$src1), + (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), + (S_LSHR_B32 SReg_32:$src1, (i32 16))) +>; + +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), f32:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), + (V_LSHRREV_B32_e64 (i32 16), VGPR_32:$src1)) +>; + +def : GCNPat < (fcopysign fp16vt:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) @@ -2271,12 +2354,27 @@ def : GCNPat < (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16) >; +// TODO: Scalar case for 0 magnitude special case +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1), + (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1) +>; + +// TODO: Handle 0 magnitude special case def : GCNPat < (fcopysign f32:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)) >; +// TODO: Scalar case for 0 magnitude special case +def : GCNPat < + (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1), + (REG_SEQUENCE VGPR_32, + (V_MOV_B16_t16_e64 0, (i16 0), 0), lo16, + (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1), hi16) +>; + def : GCNPat < (fcopysign f64:$src0, fp16vt:$src1), (REG_SEQUENCE VReg_64, @@ -2292,6 +2390,13 @@ def : GCNPat < (V_LSHRREV_B32_e64 (i32 16), $src1)), lo16) >; +// TODO: Scalar case for 0 magnitude special case +def : GCNPat < + (fcopysign (fp16vt fpimm_pos_zero), f32:$src1), + (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), + 0, (EXTRACT_SUBREG VGPR_32:$src1, hi16)) +>; + def : GCNPat < (fcopysign fp16vt:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), @@ -2309,6 +2414,16 @@ def : GCNPat < (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1) >; +def : GCNPat < + (UniformBinFrag<fcopysign> build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1), + (S_AND_B32 
(S_MOV_B32 (i32 0x80008000)), SReg_32:$src1) +>; + +def : GCNPat < + (fcopysign build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src1) +>; + } /********** ================== **********/ @@ -2638,12 +2753,34 @@ def : AMDGPUPat < >; def : AMDGPUPat < + (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), (f32 SReg_32:$src1)), + (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), $src1) +>; + +def : AMDGPUPat < + (fcopysign (f32 fpimm_pos_zero), (f32 VGPR_32:$src1)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), $src1) +>; + +def : AMDGPUPat < (fcopysign f32:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) >; def : AMDGPUPat < + (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), SReg_64:$src1), + (S_AND_B32 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) +>; + +def : AMDGPUPat < + (fcopysign (f32 fpimm_pos_zero), VReg_64:$src1), + (V_AND_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))), + (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) +>; + +def : AMDGPUPat < (fcopysign f64:$src0, f64:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -2663,8 +2800,6 @@ def : AMDGPUPat < let True16Predicate = NotHasTrue16BitInsts in { let SubtargetPredicate = isNotGFX9Plus in { -def : ROTRPattern <V_ALIGNBIT_B32_e64>; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -2675,14 +2810,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm: } // isNotGFX9Plus let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - foreach pat = [(i32 
(DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in def : GCNPat<pat, @@ -2704,15 +2831,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - (EXTRACT_SUBREG $src1, lo16), - /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2731,14 +2849,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), } // end True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), @@ -2784,7 +2894,11 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { } defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern<v3f32, f32, "V3">; defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern<v5f32, f32, "V5">; +defm : SI_INDIRECT_Pattern<v6f32, f32, "V6">; +defm : SI_INDIRECT_Pattern<v7f32, f32, "V7">; defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">; defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">; @@ -2794,7 +2908,11 @@ defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; defm : SI_INDIRECT_Pattern 
<v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern<v3i32, i32, "V3">; defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern<v5i32, i32, "V5">; +defm : SI_INDIRECT_Pattern<v6i32, i32, "V6">; +defm : SI_INDIRECT_Pattern<v7i32, i32, "V7">; defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">; defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">; @@ -2930,15 +3048,25 @@ def : GCNPat < >; def : GCNPat < - (i64 (zext i32:$src)), + (i64 (UniformUnaryFrag<zext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; def : GCNPat < - (i64 (anyext i32:$src)), + (i64 (zext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : GCNPat < + (i64 (UniformUnaryFrag<anyext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) >; +def : GCNPat < + (i64 (anyext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) +>; + class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, @@ -3459,10 +3587,7 @@ def : GCNPat< // If fcanonicalize's operand is implicitly canonicalized, we only need a copy. 
let AddedComplexity = 8 in { foreach vt = [f16, v2f16, f32, v2f32, f64] in { - def : GCNPat< - (fcanonicalize (vt is_canonicalized:$src)), - (COPY vt:$src) - >; + def : GCNPat<(fcanonicalize_canonicalized vt:$src), (COPY vt:$src)>; } } @@ -3481,30 +3606,6 @@ def : GCNPat< >; } // End True16Predicate -let True16Predicate = UseRealTrue16Insts in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) ->; - -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) ->; -} // End True16Predicate - -let True16Predicate = UseFakeTrue16Insts in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) ->; - -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) ->; -} // End True16Predicate - def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) @@ -3663,8 +3764,6 @@ def : GCNPat < SRCMODS.NONE, $src2) >; -// COPY is workaround tablegen bug from multiple outputs -// from S_LSHL_B32's multiple outputs from implicit scc def. 
let AddedComplexity = 1 in { def : GCNPat < (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), @@ -3683,7 +3782,7 @@ def : GCNPat < >; def : GCNPat < - (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), + (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 fpimm_pos_zero))), (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; } @@ -3694,7 +3793,7 @@ def : GCNPat < >; def : GCNPat < - (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 fpimm_pos_zero))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; @@ -3879,9 +3978,6 @@ let SubtargetPredicate = isGFX9Plus in { let True16Predicate = NotHasTrue16BitInsts in def : PackB32Pat<V_PACK_B32_F16_e64>; -let True16Predicate = UseRealTrue16Insts in - def : PackB32Pat<V_PACK_B32_F16_t16_e64>; - let True16Predicate = UseFakeTrue16Insts in def : PackB32Pat<V_PACK_B32_F16_fake16_e64>; } // End SubtargetPredicate = isGFX9Plus @@ -4551,6 +4647,7 @@ def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; @@ -4737,6 +4834,23 @@ def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction { let hasSideEffects = 0; } +// llvm.sponentry +def G_AMDGPU_SPONENTRY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins); + let hasSideEffects = 0; +} + +class LoadMonitorInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let 
InOperandList = (ins ptype1:$ptr); + let hasSideEffects = 0; + let mayLoad = 1; +} + +def G_AMDGPU_FLAT_LOAD_MONITOR : LoadMonitorInstruction; +def G_AMDGPU_GLOBAL_LOAD_MONITOR : LoadMonitorInstruction; + //============================================================================// // Dummy Instructions //============================================================================// @@ -4749,3 +4863,14 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> { let hasSideEffects = 1; let SubtargetPredicate = isGFX10Plus; } + +defvar VGPR32_Ptr_Opcodes = [LOAD_STACK_GUARD]; +defvar VGPR64_Ptr_Opcodes = !listremove(PseudosWithPtrOps, VGPR32_Ptr_Opcodes); + +foreach inst = VGPR32_Ptr_Opcodes in { + def : RemapPointerOperands<inst, VGPR_32>; +} + +foreach inst = VGPR64_Ptr_Opcodes in { + def : RemapPointerOperands<inst, VReg_64_AlignTarget>; +} diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 6537b79..83cf457 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/InitializePasses.h" @@ -32,6 +33,7 @@ private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineDominatorTree *MDT; + MachineLoopInfo *MLI; const AMDGPU::LaneMaskConstants &LMC; void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST, @@ -39,9 +41,10 @@ private: void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: - SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT) + SILateBranchLowering(const GCNSubtarget &ST, MachineDominatorTree *MDT, + MachineLoopInfo *MLI) : ST(ST), TII(ST.getInstrInfo()), TRI(&TII->getRegisterInfo()), MDT(MDT), - LMC(AMDGPU::LaneMaskConstants::get(ST)) {} 
+ MLI(MLI), LMC(AMDGPU::LaneMaskConstants::get(ST)) {} bool run(MachineFunction &MF); }; @@ -54,7 +57,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); - return SILateBranchLowering(ST, MDT).run(MF); + auto *MLIWP = getAnalysisIfAvailable<MachineLoopInfoWrapperPass>(); + MachineLoopInfo *MLI = MLIWP ? &MLIWP->getLI() : nullptr; + return SILateBranchLowering(ST, MDT, MLI).run(MF); } StringRef getPassName() const override { @@ -64,6 +69,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineLoopInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -117,7 +123,7 @@ static void generateEndPgm(MachineBasicBlock &MBB, } static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, - MachineDominatorTree *MDT) { + MachineDominatorTree *MDT, MachineLoopInfo *MLI) { MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true); // Update dominator tree @@ -129,6 +135,12 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, } DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); MDT->applyUpdates(DTUpdates); + + // Update loop info if available + if (MLI) { + if (MachineLoop *Loop = MLI->getLoopFor(&MBB)) + Loop->addBasicBlockToLoop(SplitBB, *MLI); + } } static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB, @@ -186,20 +198,20 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI, for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx) MI.removeOperand(OpIdx); - MI.setDesc(TII->get(AMDGPU::SI_TCRETURN)); + MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN)); } void SILateBranchLowering::earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock) { MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc DL 
= MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)) .addMBB(EarlyExitBlock); auto Next = std::next(MI.getIterator()); if (Next != MBB.end() && !Next->isTerminator()) - splitBlock(MBB, *BranchMI, MDT); + splitBlock(MBB, *BranchMI, MDT, MLI); MBB.addSuccessor(EarlyExitBlock); MDT->insertEdge(&MBB, EarlyExitBlock); @@ -210,11 +222,14 @@ llvm::SILateBranchLoweringPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF); - if (!SILateBranchLowering(ST, MDT).run(MF)) + auto *MLI = MFAM.getCachedResult<MachineLoopAnalysis>(MF); + if (!SILateBranchLowering(ST, MDT, MLI).run(MF)) return PreservedAnalyses::all(); - return getMachineFunctionPassPreservedAnalyses() - .preserve<MachineDominatorTreeAnalysis>(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachineLoopAnalysis>(); + return PA; } bool SILateBranchLowering::run(MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f0d1117..0141c36 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -193,6 +193,8 @@ class SILoadStoreOptimizer { unsigned LoSubReg = 0; unsigned HiSubReg = 0; + // True when using V_ADD_U64_e64 pattern + bool UseV64Pattern = false; }; struct MemAddress { @@ -233,10 +235,11 @@ private: void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName, Register DestReg) const; + const DebugLoc &DL, AMDGPU::OpName OpName, + Register DestReg) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName) const; + const DebugLoc &DL, AMDGPU::OpName 
OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -278,9 +281,12 @@ private: void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; + void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const; Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; - std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; + bool processBaseWithConstOffset64(MachineInstr *AddDef, + const MachineOperand &Base, + MemAddress &Addr) const; void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; /// Promotes constant offset to the immediate by adjusting the base. It /// tries to use a base from the nearby instructions that allows it to have @@ -1336,11 +1342,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), AMDGPU::OpName::data1); - const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI); + const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx); - const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI); + const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), @@ -1367,10 +1371,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. 
void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, - Register DestReg) const { + MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, + AMDGPU::OpName OpName, Register DestReg) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1387,7 +1390,7 @@ void SILoadStoreOptimizer::copyToDestRegs( BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. - .addReg(DestReg, 0, SubRegIdx0); + .addReg(DestReg, {}, SubRegIdx0); BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); @@ -1398,9 +1401,9 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1456,11 +1459,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); - unsigned BaseRegFlags = 0; + RegState BaseRegFlags = {}; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) @@ -1471,7 +1475,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) - 
.addReg(AddrReg->getReg(), 0, BaseSubReg) + .addReg(AddrReg->getReg(), {}, BaseSubReg) .addImm(0); // clamp bit BaseSubReg = 0; } @@ -1484,7 +1488,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1541,11 +1545,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); - unsigned BaseRegFlags = 0; + RegState BaseRegFlags = {}; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) @@ -1556,7 +1561,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg(), 0, BaseSubReg) + .addReg(AddrReg->getReg(), {}, BaseSubReg) .addImm(0); // clamp bit BaseSubReg = 0; } @@ -1582,7 +1587,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1607,7 +1614,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineInstr *New = 
MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1618,7 +1625,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1639,7 +1648,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( New.addImm(MergedOffset); New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1650,7 +1659,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1680,7 +1691,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1691,7 +1702,9 @@ MachineBasicBlock::iterator 
SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1731,7 +1744,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1742,12 +1755,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -1789,7 +1803,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1807,7 +1823,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( .addImm(CI.CPol) .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, 
AMDGPU::OpName::vdst, DestReg); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1818,12 +1834,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) @@ -2094,12 +2112,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -2149,8 +2168,35 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, const MemAddress &Addr) const { MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator MBBI = MI.getIterator(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); + + LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); + // Use V_ADD_U64_e64 when the original pattern used it (gfx1250+) + if (Addr.Base.UseV64Pattern) { + Register FullDestReg = MRI->createVirtualRegister( + 
TII->getRegClass(TII->get(AMDGPU::V_ADD_U64_e64), 0)); + + // Load the 64-bit offset into an SGPR pair if needed + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *MovOffset = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), + OffsetReg) + .addImm(Addr.Offset); + MachineInstr *Add64 = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_U64_e64), FullDestReg) + .addReg(Addr.Base.LoReg) + .addReg(OffsetReg, RegState::Kill) + .addImm(0); + (void)MovOffset; + (void)Add64; + LLVM_DEBUG(dbgs() << " " << *MovOffset << "\n"; + dbgs() << " " << *Add64 << "\n\n";); + + return FullDestReg; + } + + // Original carry-chain pattern (V_ADD_CO_U32 + V_ADDC_U32) assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || Addr.Base.LoSubReg) && "Expected 32-bit Base-Register-Low!!"); @@ -2159,7 +2205,6 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, Addr.Base.HiSubReg) && "Expected 32-bit Base-Register-Hi!!"); - LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); MachineOperand OffsetHi = createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); @@ -2171,23 +2216,19 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *LoHalf = - BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) - .addReg(CarryReg, RegState::Define) - .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) - .add(OffsetLo) - .addImm(0); // clamp bit - (void)LoHalf; - LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) + .addReg(CarryReg, RegState::Define) + .addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg) + .add(OffsetLo) + .addImm(0); // clamp bit MachineInstr *HiHalf = - BuildMI(*MBB, MBBI, DL, 
TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) - .addReg(DeadCarryReg, RegState::Define | RegState::Dead) - .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) - .add(OffsetHi) - .addReg(CarryReg, RegState::Kill) - .addImm(0); // clamp bit - (void)HiHalf; - LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg) + .add(OffsetHi) + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); MachineInstr *FullBase = @@ -2196,8 +2237,13 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, .addImm(AMDGPU::sub0) .addReg(DestSub1) .addImm(AMDGPU::sub1); + + (void)LoHalf; + (void)HiHalf; (void)FullBase; - LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); + LLVM_DEBUG(dbgs() << " " << *LoHalf << "\n"; + dbgs() << " " << *HiHalf << "\n"; + dbgs() << " " << *FullBase << "\n\n";); return FullDestReg; } @@ -2212,20 +2258,33 @@ void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); } -std::optional<int32_t> -SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { - if (Op.isImm()) - return Op.getImm(); +// Helper to extract a 64-bit constant offset from a V_ADD_U64_e64 instruction. +// Returns true if successful, populating Addr with base register info and +// offset. 
+bool SILoadStoreOptimizer::processBaseWithConstOffset64( + MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const { + if (!Base.isReg()) + return false; + + MachineOperand *Src0 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1); - if (!Op.isReg()) - return std::nullopt; + const MachineOperand *BaseOp = nullptr; - MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); - if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || - !Def->getOperand(1).isImm()) - return std::nullopt; + auto Offset = TII->getImmOrMaterializedImm(*Src1); - return Def->getOperand(1).getImm(); + if (Offset) { + BaseOp = Src0; + Addr.Offset = *Offset; + } else { + // Both or neither are constants - can't handle this pattern + return false; + } + + // Now extract the base register (which should be a 64-bit VGPR). + Addr.Base.LoReg = BaseOp->getReg(); + Addr.Base.UseV64Pattern = true; + return true; } // Analyze Base and extracts: @@ -2238,14 +2297,27 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec // %Base:vreg_64 = // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +// +// Also handles V_ADD_U64_e64 pattern (gfx1250+): +// %OFFSET:sreg_64 = S_MOV_B64_IMM_PSEUDO 256 +// %Base:vreg_64 = V_ADD_U64_e64 %BASE:vreg_64, %OFFSET:sreg_64, 0 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const { if (!Base.isReg()) return; MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); - if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE - || Def->getNumOperands() != 5) + if (!Def) + return; + + // Try V_ADD_U64_e64 pattern first (simpler, used on gfx1250+) + if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) { + if (processBaseWithConstOffset64(Def, Base, Addr)) + return; + } + + // Fall through to REG_SEQUENCE + V_ADD_CO_U32 + 
V_ADDC_U32 pattern + if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5) return; MachineOperand BaseLo = Def->getOperand(1); @@ -2260,14 +2332,14 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) return; - const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); - const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + MachineOperand *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); - auto Offset0P = extractConstOffset(*Src0); + auto Offset0P = TII->getImmOrMaterializedImm(*Src0); if (Offset0P) BaseLo = *Src1; else { - if (!(Offset0P = extractConstOffset(*Src1))) + if (!(Offset0P = TII->getImmOrMaterializedImm(*Src1))) return; BaseLo = *Src0; } @@ -2297,6 +2369,32 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); } +// Maintain the correct LDS address for async loads and stores. +// It becomes incorrect when promoteConstantOffsetToImm adds an offset only +// meant for the global address operand. For async loads the LDS address is in +// vdst. For async stores, the LDS address is in vdata. 
+void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI, + int32_t OffsetDiff) const { + if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0) + return; + + MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (!LDSAddr) + LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); + assert(LDSAddr); + + Register OldReg = LDSAddr->getReg(); + Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg)); + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg) + .addReg(OldReg) + .addImm(-OffsetDiff) + .addImm(0); + + LDSAddr->setReg(NewReg); +} + bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineInstr &MI, MemInfoMap &Visited, @@ -2426,7 +2524,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // Instead of moving up, just re-compute anchor-instruction's base address. Register Base = computeBase(MI, AnchorAddr); - updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); + int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset; + updateBaseAndOffset(MI, Base, OffsetDiff); + updateAsyncLDSAddress(MI, OffsetDiff); LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { @@ -2437,7 +2537,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( if (TLI->isLegalFlatAddressingMode(AM, AS)) { LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")"; OtherMI->dump()); - updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset); + int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset; + updateBaseAndOffset(*OtherMI, Base, OtherOffsetDiff); + updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff); LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump()); } } diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 8586d6c..9cc86e8 100644 --- 
a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -56,6 +56,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -160,6 +161,7 @@ public: AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveVariablesWrapperPass>(); + AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -742,6 +744,11 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { if (PDT) PDT->applyUpdates(DTUpdates); + if (MDT && MDT->getNode(&MBB)) + MDT->eraseNode(&MBB); + if (PDT && PDT->getNode(&MBB)) + PDT->eraseNode(&MBB); + MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { @@ -880,5 +887,6 @@ SILowerControlFlowPass::run(MachineFunction &MF, PA.preserve<SlotIndexesAnalysis>(); PA.preserve<LiveIntervalsAnalysis>(); PA.preserve<LiveVariablesAnalysis>(); + PA.preserve<MachineBlockFrequencyAnalysis>(); return PA; } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 96131bd..0b8c71a 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -75,7 +75,7 @@ Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { assert(Changed || ConstrainRegs.empty()); for (Register Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass()); ConstrainRegs.clear(); return Changed; @@ -417,7 +417,7 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() { // Copy into a 32-bit vector 
register. LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg)); assert(!MI.getOperand(0).getSubReg()); @@ -616,7 +616,7 @@ bool Vreg1LoweringHelper::lowerCopiesToI1() { if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) continue; - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); Register SrcReg = MI.getOperand(1).getReg(); assert(!MI.getOperand(1).getSubReg()); @@ -881,18 +881,14 @@ SILowerI1CopiesPass::run(MachineFunction &MF, return PreservedAnalyses::all(); // TODO: Probably preserves most. - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; + return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); } class SILowerI1CopiesLegacy : public MachineFunctionPass { public: static char ID; - SILowerI1CopiesLegacy() : MachineFunctionPass(ID) { - initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry()); - } + SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 40eeeb8..cbd08f0 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MachineFunction &MF = *SaveBlock.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); - if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) { for (const 
CalleeSavedInfo &CS : CSI) { // Insert the spill to the stack frame. MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + const TargetRegisterClass *RC = RI->getMinimalPhysRegClass( Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens // since we pass some special inputs (workgroup IDs) in the callee saved // range. - const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI); + const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI); TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(), - RC, TRI, Register()); + RC, Register()); if (Indexes) { assert(std::distance(MIS.begin(), I) == 1); diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp index ef384c2..4aa4186 100644 --- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp @@ -53,9 +53,7 @@ class SILowerWWMCopiesLegacy : public MachineFunctionPass { public: static char ID; - SILowerWWMCopiesLegacy() : MachineFunctionPass(ID) { - initializeSILowerWWMCopiesLegacyPass(*PassRegistry::getPassRegistry()); - } + SILowerWWMCopiesLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b398db4..af3226d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -37,7 +37,7 @@ static cl::opt<bool, true> MFMAVGPRFormOpt( "amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. 
If " "unspecified, default to compiler heuristics"), - cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false), + cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true), cl::Hidden); const GCNTargetMachine &getTM(const GCNSubtarget *STI) { @@ -114,7 +114,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; - if (!ST.enableFlatScratch()) { + if (!ST.hasFlatScratchEnabled()) { // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; @@ -169,7 +169,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (WorkItemIDZ) WorkItemIDY = true; - if (!ST.flatScratchIsArchitected()) { + if (!ST.hasArchitectedFlatScratch()) { PrivateSegmentWaveByteOffset = true; // HS and GS always have the scratch wave offset in SGPR5 on GFX9. @@ -692,11 +692,10 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, if (Arg.isMasked()) SA.Mask = Arg.getMask(); - A = SA; + A = std::move(SA); return true; }; - // TODO: Need to serialize kernarg preloads. bool Any = false; Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); @@ -718,6 +717,21 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY); Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ); + // Write FirstKernArgPreloadReg separately, since it's a Register, + // not ArgDescriptor. 
+ if (ArgInfo.FirstKernArgPreloadReg) { + Register Reg = ArgInfo.FirstKernArgPreloadReg; + assert(Reg.isPhysical() && + "FirstKernArgPreloadReg must be a physical register"); + + yaml::SIArgument SA = yaml::SIArgument::createArgument(true); + raw_string_ostream OS(SA.RegisterName.Value); + OS << printReg(Reg, &TRI); + + AI.FirstKernArgPreloadReg = SA; + Any = true; + } + if (Any) return AI; @@ -730,9 +744,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()), GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()), - IsEntryFunction(MFI.isEntryFunction()), - NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), - MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), + IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()), + WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), @@ -750,7 +763,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()), IsWholeWaveFunction(MFI.isWholeWaveFunction()), DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()), - ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) { + ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()), + NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); @@ -788,7 +802,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; Occupancy = YamlMFI.Occupancy; IsEntryFunction = YamlMFI.IsEntryFunction; - NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; MemoryBound = YamlMFI.MemoryBound; WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; @@ -799,6 
+812,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; + UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs); + if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); if (!FIOrErr) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 2c1a13c..617862d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -170,6 +170,7 @@ struct SIArgumentInfo { std::optional<SIArgument> DispatchID; std::optional<SIArgument> FlatScratchInit; std::optional<SIArgument> PrivateSegmentSize; + std::optional<SIArgument> FirstKernArgPreloadReg; std::optional<SIArgument> WorkGroupIDX; std::optional<SIArgument> WorkGroupIDY; @@ -195,6 +196,7 @@ template <> struct MappingTraits<SIArgumentInfo> { YamlIO.mapOptional("dispatchID", AI.DispatchID); YamlIO.mapOptional("flatScratchInit", AI.FlatScratchInit); YamlIO.mapOptional("privateSegmentSize", AI.PrivateSegmentSize); + YamlIO.mapOptional("firstKernArgPreloadReg", AI.FirstKernArgPreloadReg); YamlIO.mapOptional("workGroupIDX", AI.WorkGroupIDX); YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); @@ -265,7 +267,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { Align DynLDSAlign; bool IsEntryFunction = false; bool IsChainFunction = false; - bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; bool HasSpilledSGPRs = false; @@ -305,13 +306,15 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { unsigned DynamicVGPRBlockSize = 0; unsigned ScratchReservedForDynamicVGPRs = 0; + unsigned NumKernargPreloadSGPRs = 0; + SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, const TargetRegisterInfo &TRI, const llvm::MachineFunction &MF); void mappingImpl(yaml::IO &YamlIO) 
override; - ~SIMachineFunctionInfo() = default; + ~SIMachineFunctionInfo() override = default; }; template <> struct MappingTraits<SIMachineFunctionInfo> { @@ -324,7 +327,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align()); YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); YamlIO.mapOptional("isChainFunction", MFI.IsChainFunction, false); - YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false); YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); @@ -361,6 +363,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false); YamlIO.mapOptional("scratchReservedForDynamicVGPRs", MFI.ScratchReservedForDynamicVGPRs, 0); + YamlIO.mapOptional("numKernargPreloadSGPRs", MFI.NumKernargPreloadSGPRs, 0); YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false); } }; @@ -1014,7 +1017,9 @@ public: void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { - return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); + if (ArgInfo.PrivateSegmentWaveByteOffset) + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); + return MCRegister(); } /// Returns the physical register reserved for use as the resource diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index fd28abe..fb0c7e6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -323,8 +323,8 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Do not Track Physical Registers, because it messes up. 
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (RegMaskPair.RegUnit.isVirtual()) - LiveInRegs.insert(RegMaskPair.RegUnit); + if (RegMaskPair.VRegOrUnit.isVirtualReg()) + LiveInRegs.insert(RegMaskPair.VRegOrUnit.asVirtualReg()); } LiveOutRegs.clear(); // There is several possibilities to distinguish: @@ -350,12 +350,13 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7 // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - Register Reg = RegMaskPair.RegUnit; - if (Reg.isVirtual() && - isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), + VirtRegOrUnit VRegOrUnit = RegMaskPair.VRegOrUnit; + if (VRegOrUnit.isVirtualReg() && + isDefBetween(VRegOrUnit.asVirtualReg(), + LIS->getInstructionIndex(*BeginBlock).getRegSlot(), LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, LIS)) { - LiveOutRegs.insert(Reg); + LiveOutRegs.insert(VRegOrUnit.asVirtualReg()); } } @@ -578,11 +579,11 @@ void SIScheduleBlock::printDebug(bool full) { << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; dbgs() << "LiveIns:\n"; for (Register Reg : LiveInRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; dbgs() << "\nLiveOuts:\n"; for (Register Reg : LiveOutRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; } dbgs() << "\nInstructions:\n"; @@ -921,7 +922,7 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { // combination of children. 
PendingColoring[SU->NodeNum] = NextNonReservedID++; } - CurrentColoring = PendingColoring; + CurrentColoring = std::move(PendingColoring); } @@ -1446,23 +1447,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } #endif - std::set<Register> InRegs = DAG->getInRegs(); + std::set<VirtRegOrUnit> InRegs = DAG->getInRegs(); addLiveRegs(InRegs); // Increase LiveOutRegsNumUsages for blocks // producing registers consumed in another // scheduling region. - for (Register Reg : DAG->getOutRegs()) { + for (VirtRegOrUnit VRegOrUnit : DAG->getOutRegs()) { for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { // Do reverse traversal int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; SIScheduleBlock *Block = Blocks[ID]; const std::set<Register> &OutRegs = Block->getOutRegs(); - if (OutRegs.find(Reg) == OutRegs.end()) + if (!VRegOrUnit.isVirtualReg() || + OutRegs.find(VRegOrUnit.asVirtualReg()) == OutRegs.end()) continue; - ++LiveOutRegsNumUsages[ID][Reg]; + ++LiveOutRegsNumUsages[ID][VRegOrUnit.asVirtualReg()]; break; } } @@ -1565,15 +1567,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { maxVregUsage = VregCurrentUsage; if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; - LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; - for (SIScheduleBlock *Block : ReadyBlocks) - dbgs() << Block->getID() << ' '; - dbgs() << "\nCurrent Live:\n"; - for (Register Reg : LiveRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - dbgs() << '\n'; - dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; - dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); + LLVM_DEBUG({ + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (SIScheduleBlock *Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (Register Reg : LiveRegs) + dbgs() << printReg(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << 
'\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; + }); Cand.Block = nullptr; for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), @@ -1625,13 +1630,13 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { // Tracking of currently alive registers to determine VGPR Usage. -void SIScheduleBlockScheduler::addLiveRegs(std::set<Register> &Regs) { - for (Register Reg : Regs) { +void SIScheduleBlockScheduler::addLiveRegs(std::set<VirtRegOrUnit> &Regs) { + for (VirtRegOrUnit VRegOrUnit : Regs) { // For now only track virtual registers. - if (!Reg.isVirtual()) + if (!VRegOrUnit.isVirtualReg()) continue; // If not already in the live set, then add it. - (void) LiveRegs.insert(Reg); + (void)LiveRegs.insert(VRegOrUnit.asVirtualReg()); } } @@ -1662,7 +1667,7 @@ void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); - addLiveRegs(Block->getOutRegs()); + LiveRegs.insert(Block->getOutRegs().begin(), Block->getOutRegs().end()); releaseBlockSuccs(Block); for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. @@ -1689,7 +1694,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, continue; if (LiveRegsConsumers[Reg] > 1) continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { DiffSetPressure[*PSetI] -= PSetI.getWeight(); } @@ -1699,7 +1704,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<Register> &InRegs, // For now only track virtual registers. 
if (!Reg.isVirtual()) continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + PSetIterator PSetI = DAG->getMRI()->getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { DiffSetPressure[*PSetI] += PSetI.getWeight(); } @@ -1846,7 +1851,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, // For now only track virtual registers if (!Reg.isVirtual()) continue; - PSetIterator PSetI = MRI.getPressureSets(Reg); + PSetIterator PSetI = MRI.getPressureSets(VirtRegOrUnit(Reg)); for (; PSetI.isValid(); ++PSetI) { if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) VgprUsage += PSetI.getWeight(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index b219cbd..1245774 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -389,7 +389,7 @@ private: SIBlockSchedCandidate &TryCand); SIScheduleBlock *pickBlock(); - void addLiveRegs(std::set<Register> &Regs); + void addLiveRegs(std::set<VirtRegOrUnit> &Regs); void decreaseLiveRegs(SIScheduleBlock *Block, std::set<Register> &Regs); void releaseBlockSuccs(SIScheduleBlock *Parent); void blockScheduled(SIScheduleBlock *Block); @@ -462,18 +462,18 @@ public: unsigned &VgprUsage, unsigned &SgprUsage); - std::set<Register> getInRegs() { - std::set<Register> InRegs; + std::set<VirtRegOrUnit> getInRegs() { + std::set<VirtRegOrUnit> InRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - InRegs.insert(RegMaskPair.RegUnit); + InRegs.insert(RegMaskPair.VRegOrUnit); } return InRegs; } - std::set<unsigned> getOutRegs() { - std::set<unsigned> OutRegs; + std::set<VirtRegOrUnit> getOutRegs() { + std::set<VirtRegOrUnit> OutRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - OutRegs.insert(RegMaskPair.RegUnit); + OutRegs.insert(RegMaskPair.VRegOrUnit); } return OutRegs; }; diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp 
b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 484861d..0daeecd 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" @@ -277,6 +278,12 @@ public: /// rmw operation, "std::nullopt" otherwise. std::optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const; + + /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store, + /// along with an indication of whether this is a load or store. If it is not + /// a direct-to-LDS operation, returns std::nullopt. + std::optional<SIMemOpInfo> + getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const; }; class SICacheControl { @@ -295,16 +302,17 @@ protected: SICacheControl(const GCNSubtarget &ST); - /// Sets named bit \p BitName to "true" if present in instruction \p MI. + /// Sets CPol \p Bits to "true" if present in instruction \p MI. /// \returns Returns true if \p MI is modified, false otherwise. - bool enableNamedBit(const MachineBasicBlock::iterator MI, - AMDGPU::CPol::CPol Bit) const; + bool enableCPolBits(const MachineBasicBlock::iterator MI, + unsigned Bits) const; /// Check if any atomic operation on AS can affect memory accessible via the /// global address space. bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const; public: + using CPol = AMDGPU::CPol::CPol; /// Create a cache control for the subtarget \p ST. static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); @@ -360,11 +368,13 @@ public: /// between memory instructions to enforce the order they become visible as /// observed by other memory instructions executing in memory scope \p Scope. 
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between - /// address spaces. Returns true iff any instructions inserted. + /// address spaces. If \p AtomicsOnly is true, only insert waits for counters + /// that are used by atomic instructions. + /// Returns true iff any instructions inserted. virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const = 0; + AtomicOrdering Order, bool AtomicsOnly) const = 0; /// Inserts any necessary instructions at position \p Pos relative to /// instruction \p MI to ensure any subsequent memory instructions of this @@ -388,31 +398,17 @@ public: bool IsCrossAddrSpaceOrdering, Position Pos) const = 0; - /// Inserts any necessary instructions before the barrier start instruction - /// \p MI in order to support pairing of barriers and fences. - virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const { - return false; - }; + /// Handle operations that are considered non-volatile. + /// See \ref isNonVolatileMemoryAccess + virtual bool handleNonVolatile(MachineInstr &MI) const { return false; } /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; }; -class SIGfx6CacheControl : public SICacheControl { -protected: - - /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::GLC); - } - - /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SLC); - } - +/// Generates code sequences for the memory model of all GFX targets below +/// GFX10. 
+class SIGfx6CacheControl final : public SICacheControl { public: SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} @@ -437,7 +433,7 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -451,30 +447,26 @@ public: Position Pos) const override; }; -class SIGfx7CacheControl : public SIGfx6CacheControl { -public: - - SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {} - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - -}; - -class SIGfx90ACacheControl : public SIGfx7CacheControl { +/// Generates code sequences for the memory model of GFX10/11. +class SIGfx10CacheControl final : public SICacheControl { public: - - SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} + SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {} bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override { + return false; + } + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; + SIAtomicAddrSpace AddrSpace) const override { + return false; + } bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -484,124 +476,27 @@ public: bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const 
override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const override; -}; - -class SIGfx940CacheControl : public SIGfx90ACacheControl { -protected: - - /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SC0); - } - - /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SC1); - } - - /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableNTBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::NT); - } - -public: - SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; bool 
insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, - Position Pos) const override; -}; - -class SIGfx10CacheControl : public SIGfx7CacheControl { -protected: - - /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. - bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::DLC); + Position Pos) const override { + return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); } - -public: - - SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; - - bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; - - bool insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const override; - - bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override; -}; - -class SIGfx11CacheControl : public SIGfx10CacheControl { -public: - SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} - - bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const override; - - bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, - bool IsLastUse) const override; }; -class SIGfx12CacheControl 
: public SIGfx11CacheControl { +class SIGfx12CacheControl final : public SICacheControl { protected: // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. // \returns Returns true if \p MI is modified, false otherwise. bool setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const; + // Sets Scope policy to \p Value if CPol operand is present in instruction \p // MI. \returns Returns true if \p MI is modified, false otherwise. bool setScope(const MachineBasicBlock::iterator MI, @@ -620,16 +515,16 @@ protected: SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { - // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases - // the behavior is the same if assuming GFX12.0 in CU mode. - assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); + SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) { + // GFX120x and GFX125x memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX120x in CU mode. 
+ assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled()); } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const override; + AtomicOrdering Order, bool AtomicsOnly) const override; bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; @@ -641,7 +536,7 @@ public: bool finalizeStore(MachineInstr &MI, bool Atomic) const override; - virtual bool handleCooperativeAtomic(MachineInstr &MI) const override; + bool handleCooperativeAtomic(MachineInstr &MI) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -664,6 +559,8 @@ public: SIAtomicAddrSpace AddrSpace) const override { return setAtomicScope(MI, Scope, AddrSpace); } + + bool handleNonVolatile(MachineInstr &MI) const override; }; class SIMemoryLegalizer final { @@ -701,6 +598,9 @@ private: /// instructions are added/deleted or \p MI is modified, false otherwise. bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + /// Expands LDS DMA operation \p MI. Returns true if instructions are + /// added/deleted or \p MI is modified, false otherwise. 
+ bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); public: SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}; @@ -775,7 +675,7 @@ getSynchronizeAddrSpaceMD(const MachineInstr &MI) { void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, const char *Msg) const { - const Function &Func = MI->getParent()->getParent()->getFunction(); + const Function &Func = MI->getMF()->getFunction(); Func.getContext().diagnose( DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc())); } @@ -830,6 +730,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::SCRATCH; if (AS == AMDGPUAS::REGION_ADDRESS) return SIAtomicAddrSpace::GDS; + if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || + AS == AMDGPUAS::BUFFER_STRIDED_POINTER) + return SIAtomicAddrSpace::GLOBAL; return SIAtomicAddrSpace::OTHER; } @@ -879,6 +782,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( } } + // FIXME: The MMO of buffer atomic instructions does not always have an atomic + // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it + // here, but the lowering should really be cleaned up at some point. 
+ if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) && + SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic) + Ordering = AtomicOrdering::Monotonic; + SIAtomicScope Scope = SIAtomicScope::NONE; SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; bool IsCrossAddressSpaceOrdering = false; @@ -985,19 +895,41 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( return constructFromMIWithMMO(MI); } +std::optional<SIMemOpInfo> +SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!SIInstrInfo::isLDSDMA(*MI)) + return std::nullopt; + + return constructFromMIWithMMO(MI); +} + +/// \returns true if \p MI has one or more MMO, and all of them are fit for +/// being marked as non-volatile. This means that either they are accessing the +/// constant address space, are accessing a known invariant memory location, or +/// that they are marked with the non-volatile metadata/MMO flag. 
+static bool isNonVolatileMemoryAccess(const MachineInstr &MI) { + if (MI.getNumMemOperands() == 0) + return false; + return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) { + return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant); + }); +} + SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) { TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); InsertCacheInv = !AmdgcnSkipCacheInvalidations; } -bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, - AMDGPU::CPol::CPol Bit) const { +bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI, + unsigned Bits) const { MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol); if (!CPol) return false; - CPol->setImm(CPol->getImm() | Bit); + CPol->setImm(CPol->getImm() | Bits); return true; } @@ -1013,18 +945,10 @@ bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const { /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); - if (ST.hasGFX940Insts()) - return std::make_unique<SIGfx940CacheControl>(ST); - if (ST.hasGFX90AInsts()) - return std::make_unique<SIGfx90ACacheControl>(ST); - if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) - return std::make_unique<SIGfx7CacheControl>(ST); - if (Generation < AMDGPUSubtarget::GFX11) - return std::make_unique<SIGfx10CacheControl>(ST); + return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX12) - return std::make_unique<SIGfx11CacheControl>(ST); + return std::make_unique<SIGfx10CacheControl>(ST); return std::make_unique<SIGfx12CacheControl>(ST); } @@ -1033,33 +957,61 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - if 
(canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: + if (!canAffectGlobalAddrSpace(AddrSpace)) { + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + return false; + } + + bool Changed = false; + switch (Scope) { + case SIAtomicScope::SYSTEM: + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + break; + } + [[fallthrough]]; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate agent scope. + Changed |= enableCPolBits(MI, CPol::SC1); + } else { // Set L1 cache policy to MISS_EVICT. // Note: there is no L2 cache bypass policy at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); + Changed |= enableCPolBits(MI, CPol::GLC); } + break; + case SIAtomicScope::WORKGROUP: + if (ST.hasGFX940Insts()) { + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore need to bypass the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. Setting + // SC bits to indicate work-group scope will do this automatically. + Changed |= enableCPolBits(MI, CPol::SC0); + } else if (ST.hasGFX90AInsts()) { + // In threadgroup split mode the waves of a work-group can be executing + // on different CUs. Therefore need to bypass the L1 which is per CU. 
+ // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. + if (ST.isTgSplitEnabled()) + Changed |= enableCPolBits(MI, CPol::GLC); + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - return Changed; } @@ -1070,8 +1022,39 @@ bool SIGfx6CacheControl::enableStoreCacheBypass( assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// The L1 cache is write through so does not need to be bypassed. There is no - /// bypass control for the L2 cache at the isa level. + /// For targets other than GFX940, the L1 cache is write through so does not + /// need to be bypassed. There is no bypass control for the L2 cache at the + /// isa level. + + if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. + Changed |= enableCPolBits(MI, CPol::SC1); + break; + case SIAtomicScope::WORKGROUP: + // Set SC bits to indicate workgroup scope. + Changed |= enableCPolBits(MI, CPol::SC0); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Leave SC bits unset to indicate wavefront scope. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + } return Changed; } @@ -1083,10 +1066,31 @@ bool SIGfx6CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically - /// bypassed, and the GLC bit is instead used to indicate if they are - /// return or no-return. - /// Note: there is no L2 cache coherent bypass control at the ISA level. + /// For targets other than GFX940, do not set GLC for RMW atomic operations as + /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to + /// indicate if they are return or no-return. Note: there is no L2 cache + /// coherent bypass control at the ISA level. + /// For GFX90A+, RMW atomics implicitly bypass the L1 cache. + + if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC1 bit to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC1); + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // RMW atomic operations implicitly bypass the L1 cache and only use SC1 + // to indicate system or agent scope. The SC0 bit is used to indicate if + // they are return or no-return. Leave SC1 bit unset to indicate agent + // scope. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } return Changed; } @@ -1097,7 +1101,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. 
The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1108,11 +1112,15 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( bool Changed = false; if (IsVolatile) { - // Set L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache bypass policy at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); + if (ST.hasGFX940Insts()) { + // Set SC bits to indicate system scope. + Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1); + } else if (Op == SIMemOp::LOAD) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. + Changed |= enableCPolBits(MI, CPol::GLC); + } // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not @@ -1120,16 +1128,20 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } if (IsNonTemporal) { - // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT - // for both loads and stores, and the L2 cache policy to STREAM. 
- Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + if (ST.hasGFX940Insts()) { + Changed |= enableCPolBits(MI, CPol::NT); + } else { + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. + Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC); + } return Changed; } @@ -1140,15 +1152,36 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, Position Pos, - AtomicOrdering Order) const { + AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; + // GFX90A+ + if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) { + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to wait for global or GDS memory operations + // to complete to ensure they are visible to waves in the other CUs. + // Otherwise in non-threadgroup split mode all waves of a work-group are on + // the same CU, so no need to wait for global memory as all waves in the + // work-group access the same the L1, nor wait for GDS as access are ordered + // on a CU. + if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | + SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && + (Scope == SIAtomicScope::WORKGROUP)) { + // Same as <GFX90A at AGENT scope; + Scope = SIAtomicScope::AGENT; + } + // In threadgroup split mode LDS cannot be allocated so no need to wait for + // LDS memory operations. 
+ AddrSpace &= ~SIAtomicAddrSpace::LDS; + } + bool VMCnt = false; bool LGKMCnt = false; @@ -1243,61 +1276,13 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) +static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST) { + if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return false; - - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- - if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + return !ST.isAmdPalOS() && !ST.isMesa3DOS(); } -bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, +bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const { @@ -1307,235 +1292,97 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); - - const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() - ? AMDGPU::BUFFER_WBINVL1 - : AMDGPU::BUFFER_WBINVL1_VOL; + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- - if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx90ACacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; + const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST) + ? AMDGPU::BUFFER_WBINVL1_VOL + : AMDGPU::BUFFER_WBINVL1; if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L1 cache policy to MISS_LRU. - // Note: there is no L2 cache bypass policy at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to bypass the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be bypassed. - if (ST.isTgSplitEnabled()) - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx90ACacheControl::enableRMWCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && MI->mayStore()); - bool Changed = false; + if (ST.hasGFX940Insts()) { + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW + // and CC will never be stale due to the local memory probes. 
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; + } - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: + if (ST.hasGFX90AInsts()) { + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW + // and CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed + // to remove any cache lines of earlier writes by the same wave and + // ensures later reads by the same wave will refetch the cache lines. + Changed = true; + break; + } + [[fallthrough]]; case SIAtomicScope::AGENT: - /// Do not set glc for RMW atomic operations as they implicitly bypass - /// the L1 cache, and the glc bit is instead used to indicate if they are - /// return or no-return. - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. RMW atomics implicitly bypass the L1 cache. 
- break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - return Changed; -} - -bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache bypass policy at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); - - return Changed; - } - - if (IsNonTemporal) { - // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT - // for both loads and stores, and the L2 cache policy to STREAM. 
- Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); - return Changed; - } - - return Changed; -} - -bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsCrossAddrSpaceOrdering, - Position Pos, - AtomicOrdering Order) const { - if (ST.isTgSplitEnabled()) { - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to wait for global or GDS memory operations - // to complete to ensure they are visible to waves in the other CUs. - // Otherwise in non-threadgroup split mode all waves of a work-group are on - // the same CU, so no need to wait for global memory as all waves in the - // work-group access the same the L1, nor wait for GDS as access are ordered - // on a CU. - if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | - SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && - (Scope == SIAtomicScope::WORKGROUP)) { - // Same as GFX7 using agent scope. - Scope = SIAtomicScope::AGENT; - } - // In threadgroup split mode LDS cannot be allocated so no need to wait for - // LDS memory operations. - AddrSpace &= ~SIAtomicAddrSpace::LDS; - } - return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, - IsCrossAddrSpaceOrdering, Pos, Order); -} - -bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) - return false; - - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. 
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. + if (ST.hasGFX940Insts()) { + // Ensures that following loads will not see stale remote date or local + // MTYPE NC global data. Local MTYPE RW and CC memory will never be + // stale due to the memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceeding buffer + // invalidate. The invalidate is guaranteed to remove any cache lines of + // earlier writes and ensures later writes will refetch the cache lines. + } else + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); Changed = true; break; - case SIAtomicScope::AGENT: - // Same as GFX7. - break; case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to invalidate the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be invalidated. if (ST.isTgSplitEnabled()) { - // Same as GFX7 using agent scope. - Scope = SIAtomicScope::AGENT; + if (ST.hasGFX940Insts()) { + // In threadgroup split mode the waves of a work-group can be + // executing on different CUs. Therefore need to invalidate the L1 + // which is per CU. Otherwise in non-threadgroup split mode all waves + // of a work-group are on the same CU, and so the L1 does not need to + // be invalidated. + + // Ensures L1 is invalidated if in threadgroup split mode. 
In + // non-threadgroup split mode it is a NOP, but no point generating it + // in that case if know not in that mode. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) + // Set SC bits to indicate work-group scope. + .addImm(AMDGPU::CPol::SC0); + // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware + // does not reorder memory operations with respect to preceeding + // buffer invalidate. The invalidate is guaranteed to remove any cache + // lines of earlier writes and ensures later writes will refetch the + // cache lines. + Changed = true; + } else if (ST.hasGFX90AInsts()) { + BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); + Changed = true; + } } break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. + // For GFX940, we could generate "BUFFER_INV" but it would do nothing as + // there are no caches to invalidate. All other targets have no cache to + // invalidate. break; default: llvm_unreachable("Unsupported synchronization scope"); @@ -1552,366 +1399,76 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) --MI; - Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); - return Changed; } -bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - const DebugLoc &DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. 
A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT - // vmcnt(0)" needed by the "BUFFER_WBL2". - Changed = true; - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - if (Pos == Position::AFTER) - --MI; - - Changed |= - SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, - IsCrossAddrSpaceOrdering, Pos); - - return Changed; -} - -bool SIGfx940CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - // Set SC bits to indicate agent scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to bypass the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be bypassed. Setting SC - // bits to indicate work-group scope will do this automatically. - Changed |= enableSC0Bit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Leave SC bits unset to indicate wavefront scope. 
- break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx940CacheControl::enableStoreCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { - assert(!MI->mayLoad() && MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - // Set SC bits to indicate agent scope. - Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::WORKGROUP: - // Set SC bits to indicate workgroup scope. - Changed |= enableSC0Bit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Leave SC bits unset to indicate wavefront scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. - - return Changed; -} - -bool SIGfx940CacheControl::enableRMWCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Set SC1 bit to indicate system scope. 
- Changed |= enableSC1Bit(MI); - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // RMW atomic operations implicitly bypass the L1 cache and only use SC1 - // to indicate system or agent scope. The SC0 bit is used to indicate if - // they are return or no-return. Leave SC1 bit unset to indicate agent - // scope. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - return Changed; -} - -bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set SC bits to indicate system scope. - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. 
- Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); - - return Changed; - } - - if (IsNonTemporal) { - Changed |= enableNTBit(MI); - return Changed; - } - - return Changed; -} - -bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - Position Pos) const { - if (!InsertCacheInv) - return false; - +bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + if (ST.hasGFX90AInsts()) { + MachineBasicBlock &MBB = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); - if (Pos == Position::AFTER) - ++MI; + if (Pos == Position::AFTER) + ++MI; - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. - Changed = true; - break; - case SIAtomicScope::AGENT: - // Ensures that following loads will not see stale remote date or local - // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale - // due to the memory probes. 
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware - // does not reorder memory operations with respect to preceeding buffer - // invalidate. The invalidate is guaranteed to remove any cache lines of - // earlier writes and ensures later writes will refetch the cache lines. - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - // In threadgroup split mode the waves of a work-group can be executing on - // different CUs. Therefore need to invalidate the L1 which is per CU. - // Otherwise in non-threadgroup split mode all waves of a work-group are - // on the same CU, and so the L1 does not need to be invalidated. - if (ST.isTgSplitEnabled()) { - // Ensures L1 is invalidated if in threadgroup split mode. In - // non-threadgroup split mode it is a NOP, but no point generating it in - // that case if know not in that mode. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) - // Set SC bits to indicate work-group scope. - .addImm(AMDGPU::CPol::SC0); - // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware - // does not reorder memory operations with respect to preceeding buffer - // invalidate. The invalidate is guaranteed to remove any cache lines of - // earlier writes and ensures later writes will refetch the cache lines. + if (canAffectGlobalAddrSpace(AddrSpace)) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by + // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. 
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); Changed = true; + break; + case SIAtomicScope::AGENT: + if (ST.hasGFX940Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate agent scope. + .addImm(AMDGPU::CPol::SC1); + + // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is + // SIAtomicScope::AGENT, the following insertWait will generate the + // required "S_WAITCNT vmcnt(0)". + Changed = true; + } + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it + // would writeback, and would require an otherwise unnecessary + // "S_WAITCNT vmcnt(0)". + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Could generate "BUFFER_INV" but it would do nothing as there are no - // caches to invalidate. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); } - } - - /// The scratch address space does not need the global memory cache - /// to be flushed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- if (Pos == Position::AFTER) - --MI; - - return Changed; -} - -bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate system scope. - .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::SYSTEM, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". - Changed = true; - break; - case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) - // Set SC bits to indicate agent scope. - .addImm(AMDGPU::CPol::SC1); - - // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is - // SIAtomicScope::AGENT, the following insertWait will generate the - // required "S_WAITCNT vmcnt(0)". - Changed = true; - break; - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Do not generate "BUFFER_WBL2" as there are no caches it would - // writeback, and would require an otherwise unnecessary - // "S_WAITCNT vmcnt(0)". 
- break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } + if (Pos == Position::AFTER) + --MI; } - if (Pos == Position::AFTER) - --MI; - // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other // S_WAITCNT needed. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } bool SIGfx10CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; @@ -1922,8 +1479,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( case SIAtomicScope::AGENT: // Set the L0 and L1 cache policies to MISS_EVICT. // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableGLCBit(MI); - Changed |= enableDLCBit(MI); + // For GFX10, set GLC+DLC, for GFX11, only set GLC. + Changed |= + enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0)); break; case SIAtomicScope::WORKGROUP: // In WGP mode the waves of a work-group can be executing on either CU of @@ -1931,7 +1489,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( // CU mode all waves of a work-group are on the same CU, and so the L0 // does not need to be bypassed. if (!ST.isCuModeEnabled()) - Changed |= enableGLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC); break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -1959,7 +1517,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // Only handle load and store, not atomic read-modify-write insructions. The // latter use glc to indicate if the atomic returns a result and so must not // be used for cache control. 
- assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. The latter are always marked as volatile so cannot sensibly @@ -1974,17 +1532,21 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // and MISS_LRU for store instructions. // Note: there is no L2 cache coherent bypass control at the ISA level. if (Op == SIMemOp::LOAD) { - Changed |= enableGLCBit(MI); - Changed |= enableDLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC); } + // GFX11: Set MALL NOALLOC for both load and store instructions. + if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); return Changed; } @@ -1994,8 +1556,12 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( // For stores setting both GLC and SLC configures L0 and L1 cache policy // to MISS_EVICT and the L2 cache policy to STREAM. if (Op == SIMemOp::STORE) - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= enableCPolBits(MI, CPol::GLC); + Changed |= enableCPolBits(MI, CPol::SLC); + + // GFX11: Set MALL NOALLOC for both load and store instructions. 
+ if (AMDGPU::isGFX11(ST)) + Changed |= enableCPolBits(MI, CPol::DLC); return Changed; } @@ -2007,11 +1573,12 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; @@ -2035,8 +1602,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, // the WGP. Therefore need to wait for operations to complete to ensure // they are visible to waves in the other CU as the L0 is per CU. // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. + if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) VMCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2143,7 +1713,7 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; @@ -2191,117 +1761,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return Changed; } -bool SIGfx10CacheControl::insertBarrierStart( - MachineBasicBlock::iterator &MI) const { - // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU - // mode. 
This is because a CU mode release fence does not emit any wait, which - // is fine when only dealing with vmem, but isn't sufficient in the presence - // of barriers which do not go through vmem. - // GFX12.5 does not require this additional wait. - if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) - return false; - - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); - return true; -} - -bool SIGfx11CacheControl::enableLoadCacheBypass( - const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace) const { - assert(MI->mayLoad() && !MI->mayStore()); - bool Changed = false; - - if (canAffectGlobalAddrSpace(AddrSpace)) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - case SIAtomicScope::AGENT: - // Set the L0 and L1 cache policies to MISS_EVICT. - // Note: there is no L2 cache coherent bypass control at the ISA level. - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in - // CU mode all waves of a work-group are on the same CU, and so the L0 - // does not need to be bypassed. - if (!ST.isCuModeEnabled()) - Changed |= enableGLCBit(MI); - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No cache to bypass. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - /// The scratch address space does not need the global memory caches - /// to be bypassed as all memory operations by the same thread are - /// sequentially consistent, and no other thread can access scratch - /// memory. - - /// Other address spaces do not have a cache. 
- - return Changed; -} - -bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( - MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, - bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { - - // Only handle load and store, not atomic read-modify-write insructions. The - // latter use glc to indicate if the atomic returns a result and so must not - // be used for cache control. - assert(MI->mayLoad() ^ MI->mayStore()); - - // Only update load and store, not LLVM IR atomic read-modify-write - // instructions. The latter are always marked as volatile so cannot sensibly - // handle it as do not want to pessimize all atomics. Also they do not support - // the nontemporal attribute. - assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); - - bool Changed = false; - - if (IsVolatile) { - // Set L0 and L1 cache policy to be MISS_EVICT for load instructions - // and MISS_LRU for store instructions. - // Note: there is no L2 cache coherent bypass control at the ISA level. - if (Op == SIMemOp::LOAD) - Changed |= enableGLCBit(MI); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableDLCBit(MI); - - // Ensure operation has completed at system scope to cause all volatile - // operations to be visible outside the program in a global order. Do not - // request cross address space as only the global address space can be - // observable outside the program, so no need to cause a waitcnt for LDS - // address space operations. - Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); - return Changed; - } - - if (IsNonTemporal) { - // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT - // and L2 cache policy to STREAM. - // For stores setting both GLC and SLC configures L0 and L1 cache policy - // to MISS_EVICT and the L2 cache policy to STREAM. 
- if (Op == SIMemOp::STORE) - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); - - // Set MALL NOALLOC for load and store instructions. - Changed |= enableDLCBit(MI); - return Changed; - } - - return Changed; -} - bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const { MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); @@ -2354,11 +1813,12 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsCrossAddrSpaceOrdering, - Position Pos, AtomicOrdering Order) const { + Position Pos, AtomicOrdering Order, + bool AtomicsOnly) const { bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); bool LOADCnt = false; bool DSCnt = false; @@ -2383,15 +1843,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // In WGP mode the waves of a work-group can be executing on either CU // of the WGP. Therefore need to wait for operations to complete to // ensure they are visible to waves in the other CU as the L0 is per CU. + // // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. + // which shares the same L0. Note that we still need to wait when + // performing a release in this mode to respect the transitivity of + // happens-before, e.g. other waves of the workgroup must be able to + // release the memory from another wave at a wider scope. // // GFX12.5: // CU$ has two ports. To ensure operations are visible at the workgroup // level, we need to ensure all operations in this port have completed // so the other SIMDs in the WG can see them. There is no ordering // guarantee between the ports. 
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() || + isReleaseOrStronger(Order)) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2444,7 +1909,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { + if (!AtomicsOnly && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2476,7 +1941,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); /// The scratch address space does not need the global memory cache /// to be flushed as all memory operations by the same thread are @@ -2527,6 +1992,17 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) --MI; + // Target requires a waitcnt to ensure that the proceeding INV has completed + // as it may get reorded with following load instructions. 
+ if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) { + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD, + /*IsCrossAddrSpaceOrdering=*/false, Pos, AtomicOrdering::Acquire, + /*AtomicsOnly=*/false); + + if (Pos == Position::AFTER) + --MI; + } + return true; } @@ -2538,7 +2014,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); // The scratch address space does not need the global memory cache // writeback as all memory operations by the same thread are @@ -2554,19 +2030,15 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. + std::optional<AMDGPU::CPol::CPol> NeedsWB; switch (Scope) { case SIAtomicScope::SYSTEM: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_SYS); - Changed = true; + NeedsWB = AMDGPU::CPol::SCOPE_SYS; break; case SIAtomicScope::AGENT: // GFX12.5 may have >1 L2 per device so we must emit a device scope WB. - if (ST.hasGFX1250Insts()) { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_DEV); - Changed = true; - } + if (ST.hasGFX1250Insts()) + NeedsWB = AMDGPU::CPol::SCOPE_DEV; break; case SIAtomicScope::CLUSTER: case SIAtomicScope::WORKGROUP: @@ -2579,6 +2051,20 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, llvm_unreachable("Unsupported synchronization scope"); } + if (NeedsWB) { + // Target requires a waitcnt to ensure that the proceeding store + // proceeding store/rmw operations have completed in L2 so their data will + // be written back by the WB instruction. 
+ if (ST.hasINVWBL2WaitCntRequirement()) + insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + /*IsCrossAddrSpaceOrdering=*/false, Pos, + AtomicOrdering::Release, + /*AtomicsOnly=*/false); + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB); + Changed = true; + } + if (Pos == Position::AFTER) --MI; } @@ -2587,17 +2073,29 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release, + /*AtomicsOnly=*/false); return Changed; } +bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const { + // On GFX12.5, set the NV CPol bit. + if (!ST.hasGFX1250Insts()) + return false; + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) + return false; + CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV); + return true; +} + bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { // Only handle load and store, not atomic read-modify-write instructions. - assert(MI->mayLoad() ^ MI->mayStore()); + assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI)); // Only update load and store, not LLVM IR atomic read-modify-write // instructions. 
The latter are always marked as volatile so cannot sensibly @@ -2618,13 +2116,21 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + if (ST.requiresWaitXCntForSingleAccessInstructions() && + SIInstrInfo::isVMEM(*MI)) { + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be // observable outside the program, so no need to cause a waitcnt for LDS // address space operations. Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, - Position::AFTER, AtomicOrdering::Unordered); + Position::AFTER, AtomicOrdering::Unordered, + /*AtomicsOnly=*/false); } return Changed; @@ -2635,9 +2141,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const bool IsRMW = (MI.mayLoad() && MI.mayStore()); bool Changed = false; - // GFX12.5 only: xcnt wait is needed before flat and global atomics - // stores/rmw. - if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() && + SIInstrInfo::isVMEM(MI)) { MachineBasicBlock &MBB = *MI.getParent(); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); Changed = true; @@ -2653,7 +2158,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
- if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) + if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic && + Scope == CPol::SCOPE_SYS) Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); return Changed; @@ -2748,13 +2254,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), SIMemOp::LOAD | SIMemOp::STORE, MOI.getIsCrossAddressSpaceOrdering(), - Position::BEFORE, Order); + Position::BEFORE, Order, /*AtomicsOnly=*/false); if (Order == AtomicOrdering::Acquire || Order == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // The wait below only needs to wait on the prior atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER, Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2830,9 +2338,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); if (Order == AtomicOrdering::Acquire) { - Changed |= CC->insertWait( - MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order); + // Acquire fences only need to wait on the previous atomic they pair with. 
+ Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace, + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE, Order, /*AtomicsOnly=*/true); } if (Order == AtomicOrdering::Release || @@ -2897,10 +2407,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Order == AtomicOrdering::SequentiallyConsistent || MOI.getFailureOrdering() == AtomicOrdering::Acquire || MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= CC->insertWait( - MI, MOI.getScope(), MOI.getInstrAddrSpace(), - isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, - MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order); + // Only wait on the previous atomic. + Changed |= + CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(), + isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, + Order, /*AtomicsOnly=*/true); Changed |= CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), Position::AFTER); @@ -2913,6 +2425,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + // The volatility or nontemporal-ness of the operation is a + // function of the global memory, not the LDS. + SIMemOp OpKind = + SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE; + + // Handle volatile and/or nontemporal markers on direct-to-LDS loads and + // stores. The operation is treated as a volatile/nontemporal store + // to its second argument. 
+ return CC->enableVolatileAndOrNonTemporal( + MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(), + MOI.isNonTemporal(), MOI.isLastUse()); +} + bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { const MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); @@ -2956,22 +2485,21 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { MI = II->getIterator(); } - if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) { - Changed |= CC->insertBarrierStart(MI); - continue; + if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) { + if (const auto &MOI = MOA.getLoadInfo(MI)) + Changed |= expandLoad(*MOI, MI); + else if (const auto &MOI = MOA.getStoreInfo(MI)) + Changed |= expandStore(*MOI, MI); + else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) + Changed |= expandLDSDMA(*MOI, MI); + else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) + Changed |= expandAtomicFence(*MOI, MI); + else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) + Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); } - if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) - continue; - - if (const auto &MOI = MOA.getLoadInfo(MI)) - Changed |= expandLoad(*MOI, MI); - else if (const auto &MOI = MOA.getStoreInfo(MI)) { - Changed |= expandStore(*MOI, MI); - } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) - Changed |= expandAtomicFence(*MOI, MI); - else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) - Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); + if (isNonVolatileMemoryAccess(*MI)) + Changed |= CC->handleNonVolatile(*MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp index f9efee6..9a58382 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -28,19 +28,9 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, DX10Clamp = DX10ClampAttr == "true"; } - StringRef DenormF32Attr = 
- F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); - if (!DenormF32Attr.empty()) - FP32Denormals = parseDenormalFPAttribute(DenormF32Attr); - - StringRef DenormAttr = - F.getFnAttribute("denormal-fp-math").getValueAsString(); - if (!DenormAttr.empty()) { - DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr); - if (DenormF32Attr.empty()) - FP32Denormals = DenormMode; - FP64FP16Denormals = DenormMode; - } + DenormalFPEnv FPEnv = F.getDenormalFPEnv(); + FP64FP16Denormals = FPEnv.DefaultMode; + FP32Denormals = FPEnv.F32Mode; } using namespace AMDGPU; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index aa028c8..47bc218 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -76,9 +76,7 @@ class SIOptimizeExecMaskingLegacy : public MachineFunctionPass { public: static char ID; - SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) { - initializeSIOptimizeExecMaskingLegacyPass(*PassRegistry::getPassRegistry()); - } + SIOptimizeExecMaskingLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index c186f5a..ac24f2f 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -54,10 +54,7 @@ class SIOptimizeExecMaskingPreRALegacy : public MachineFunctionPass { public: static char ID; - SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) { - initializeSIOptimizeExecMaskingPreRALegacyPass( - *PassRegistry::getPassRegistry()); - } + SIOptimizeExecMaskingPreRALegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -473,6 +470,8 @@ bool SIOptimizeExecMaskingPreRA::run(MachineFunction &MF) { assert(Idx != -1); if (SingleExecUser->getParent() == 
I->getParent() && !SingleExecUser->getOperand(Idx).isImplicit() && + static_cast<unsigned>(Idx) < + SingleExecUser->getDesc().getNumOperands() && TII->isOperandLegal(*SingleExecUser, Idx, &I->getOperand(1))) { LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n'); LIS->RemoveMachineInstrFromMaps(*I); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 41402bd..610a835 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -81,6 +81,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bfac639..926c52f 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -118,7 +118,7 @@ public: MachineInstr *getParentInst() const { return Target->getParent(); } MachineRegisterInfo *getMRI() const { - return &getParentInst()->getParent()->getParent()->getRegInfo(); + return &getParentInst()->getMF()->getRegInfo(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1284,7 +1284,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // Clone the instruction to allow revoking changes // made to MI during the processing of the operands // if the conversion fails. 
- SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI); + SDWAInst = MI.getMF()->CloneMachineInstr(&MI); MI.getParent()->insert(MI.getIterator(), SDWAInst); } else { SDWAInst = createSDWAVersion(MI); @@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { - if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) - continue; - - unsigned I = Op.getOperandNo(); + if (Op.isReg()) { + if (TRI->isVGPR(*MRI, Op.getReg())) + continue; - int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); - if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) + if (ST.hasSDWAScalar() && ConstantBusCount == 0) { + ++ConstantBusCount; + continue; + } + } else if (!Op.isImm()) continue; - if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && - TRI->isSGPRReg(*MRI, Op.getReg())) { - ++ConstantBusCount; + unsigned I = Op.getOperandNo(); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I); + if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; - } Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), @@ -1355,8 +1356,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, if (Op.isImm()) Copy.addImm(Op.getImm()); else if (Op.isReg()) - Copy.addReg(Op.getReg(), Op.isKill() ? 
RegState::Kill : 0, - Op.getSubReg()); + Copy.addReg(Op.getReg(), getKillRegState(Op.isKill()), Op.getSubReg()); Op.ChangeToRegister(VGPR, false); } } diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index 5720b97..787f7b3 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -29,9 +29,7 @@ public: static char ID; public: - SIPostRABundlerLegacy() : MachineFunctionPass(ID) { - initializeSIPostRABundlerLegacyPass(*PassRegistry::getPassRegistry()); - } + SIPostRABundlerLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -110,7 +108,7 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, "subregister indexes should not be present after RA"); for (MCRegUnit Unit : TRI->regunits(Reg)) - UsedRegUnits.set(Unit); + UsedRegUnits.set(static_cast<unsigned>(Unit)); } } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index ecfaa5c..b9f2993 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -153,11 +153,13 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); for (unsigned Reg : RegsToRewrite) { - LIS->removeInterval(Reg); - const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); + LiveInterval &LI = LIS->getInterval(Reg); + Matrix->unassign(LI, /*ClearAllReferencingSegments=*/true); + LIS->removeInterval(Reg); + MFI->reserveWWMRegister(PhysReg); } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 01a40c1..73aab4e 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -22,10 +22,11 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SetVector.h" 
+#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/BranchProbability.h" - using namespace llvm; #define DEBUG_TYPE "si-pre-emit-peephole" @@ -47,9 +48,6 @@ private: const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - // Check if the machine instruction being processed is a supported packed - // instruction. - bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. void collectUnpackingCandidates(MachineInstr &BeginMI, @@ -68,11 +66,11 @@ private: // this transformation. void performF32Unpacking(MachineInstr &I); // Select corresponding unpacked instruction - uint16_t mapToUnpackedOpcode(MachineInstr &I); + uint32_t mapToUnpackedOpcode(MachineInstr &I); // Creates the unpacked instruction to be inserted. Adds source modifiers to // the unpacked instructions based on the source modifiers in the packed // instruction. - MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode, + MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint32_t UnpackedOpcode, bool IsHiBits); // Process operands/source modifiers from packed instructions and insert the // appropriate source modifers and operands into the unpacked instructions. 
@@ -87,9 +85,7 @@ class SIPreEmitPeepholeLegacy : public MachineFunctionPass { public: static char ID; - SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) { - initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry()); - } + SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { return SIPreEmitPeephole().run(MF); @@ -156,11 +152,12 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { MachineOperand &Op1 = A->getOperand(1); MachineOperand &Op2 = A->getOperand(2); - if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { + if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() && + Op2.getReg() == ExecReg) { TII->commuteInstruction(*A); Changed = true; } - if (Op1.getReg() != ExecReg) + if (!Op1.isReg() || Op1.getReg() != ExecReg) return Changed; if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0)) return Changed; @@ -299,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } -// If support is extended to new operations, add tests in -// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. 
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - if (!TII->isNeverCoissue(MI)) - return false; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("Fully covered switch"); -} - bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); @@ -528,7 +508,7 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { return false; } -uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { +uint32_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { unsigned Opcode = I.getOpcode(); // Use 64 bit encoding to allow use of VOP3 instructions. // VOP3 e64 instructions allow source modifiers @@ -541,7 +521,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { case AMDGPU::V_PK_FMA_F32: return AMDGPU::V_FMA_F32_e64; default: - return std::numeric_limits<uint16_t>::max(); + return std::numeric_limits<uint32_t>::max(); } llvm_unreachable("Fully covered switch"); } @@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates( for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + uint32_t UnpackedOpCode = mapToUnpackedOpcode(Instr); + bool IsUnpackable = + !(UnpackedOpCode == std::numeric_limits<uint32_t>::max()); if (Instr.isMetaInstruction()) continue; if ((Instr.isTerminator()) || - (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (TII->isNeverCoissue(Instr) && !IsUnpackable) || (SIInstrInfo::modifiesModeRegister(Instr) && Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; @@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; } - if (!isUnpackingSupportedInstr(Instr)) + if (!IsUnpackable) continue; if 
(canUnpackingClobberRegister(Instr)) @@ -657,10 +640,10 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); - uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && + uint32_t UnpackedOpcode = mapToUnpackedOpcode(I); + assert(UnpackedOpcode != std::numeric_limits<uint32_t>::max() && "Unsupported Opcode"); MachineInstrBuilder Op0LOp1L = @@ -683,12 +666,12 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { } MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, - uint16_t UnpackedOpcode, + uint32_t UnpackedOpcode, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); - const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? 
TRI->getSubReg(DstReg, AMDGPU::sub1) @@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1); if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *SrcMO3 = + const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src2); unsigned Src2Mods = TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); - addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -722,10 +705,17 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, PreservedAnalyses llvm::SIPreEmitPeepholePass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - if (!SIPreEmitPeephole().run(MF)) - return PreservedAnalyses::all(); + auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF); + auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF); + + if (SIPreEmitPeephole().run(MF)) + return getMachineFunctionPassPreservedAnalyses(); - return getMachineFunctionPassPreservedAnalyses(); + if (MDT) + MDT->updateBlockNumbers(); + if (MPDT) + MPDT->updateBlockNumbers(); + return PreservedAnalyses::all(); } bool SIPreEmitPeephole::run(MachineFunction &MF) { @@ -787,9 +777,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // TODO: Fold this into previous block, if possible. Evaluate and handle any // side effects. 
+ + // Perform the extra MF scans only for supported archs + if (!ST.hasGFX940Insts()) + return Changed; for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA + // Unpack packed instructions overlapped by MFMAs. This allows the + // compiler to co-issue unpacked instructions with MFMA auto SchedModel = TII->getSchedModel(); SetVector<MachineInstr *> InstrsToUnpack; for (auto &MI : make_early_inc_range(MBB.instrs())) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ebd2e7e..ee46157 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -340,10 +340,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) "getNumCoveredRegs() will not work with generated subreg masks!"); RegPressureIgnoredUnits.resize(getNumRegUnits()); - RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); + RegPressureIgnoredUnits.set( + static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin())); for (auto Reg : AMDGPU::VGPR_16RegClass) { if (AMDGPU::isHi16Reg(Reg, *this)) - RegPressureIgnoredUnits.set(*regunits(Reg).begin()); + RegPressureIgnoredUnits.set( + static_cast<unsigned>(*regunits(Reg).begin())); } // HACK: Until this is fully tablegen'd. @@ -864,7 +866,8 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { [[fallthrough]]; } case AMDGPU::V_ADD_U32_e64: - // FIXME: This optimization is barely profitable enableFlatScratch as-is. + // FIXME: This optimization is barely profitable hasFlatScratchEnabled + // as-is. // // Much of the benefit with the MUBUF handling is we avoid duplicating the // shift of the frame register, which isn't needed with scratch. 
@@ -872,7 +875,7 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // materializeFrameBaseRegister doesn't know the register classes of the // uses, and unconditionally uses an s_add_i32, which will end up using a // copy for the vector uses. - return !ST.enableFlatScratch(); + return !ST.hasFlatScratchEnabled(); case AMDGPU::V_ADD_CO_U32_e32: if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 && !isFIPlusImmOrVGPR(*this, *MI)) @@ -912,12 +915,12 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineFunction *MF = MBB->getParent(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32 - : AMDGPU::V_MOV_B32_e32; + unsigned MovOpc = + ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; Register BaseReg = MRI.createVirtualRegister( - ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass - : &AMDGPU::VGPR_32RegClass); + ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass + : &AMDGPU::VGPR_32RegClass); if (Offset == 0) { BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg) @@ -927,16 +930,16 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - Register FIReg = MRI.createVirtualRegister( - ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass - : &AMDGPU::VGPR_32RegClass); + Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled() + ? &AMDGPU::SReg_32_XM0RegClass + : &AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg) .addFrameIndex(FrameIdx); - if (ST.enableFlatScratch() ) { + if (ST.hasFlatScratchEnabled()) { // FIXME: Make sure scc isn't live in. 
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) .addReg(OffsetReg, RegState::Kill) @@ -989,9 +992,9 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: materializeFrameBaseRegister does not know the register class of - // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit - // a copy so we have a legal operand and hope the register coalescer can - // clean it up. + // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled. + // Emit a copy so we have a legal operand and hope the register coalescer + // can clean it up. if (isSGPRReg(MRI, BaseReg)) { Register BaseRegVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -1407,7 +1410,7 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, unsigned Dst = IsStore ? Reg : ValueReg; unsigned Src = IsStore ? ValueReg : Reg; bool IsVGPR = TRI->isVGPR(MRI, Reg); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { // Spiller during regalloc may restore a spilled register to its superclass. // It could result in AGPR spills restored to VGPRs or the other way around, @@ -1546,7 +1549,10 @@ void SIRegisterInfo::buildSpillLoadStore( int64_t Offset = InstOffset + MFI.getObjectOffset(Index); int64_t MaterializedOffset = Offset; - int64_t MaxOffset = Offset + Size + RemSize - EltSize; + // Maxoffset is the starting offset for the last chunk to be spilled. + // In case of non-zero remainder element, max offset will be the + // last address(offset + Size) after spilling all the EltSize chunks. + int64_t MaxOffset = Offset + Size - (RemSize ? 
0 : EltSize); int64_t ScratchOffsetRegDelta = 0; if (IsFlat && EltSize > 4) { @@ -1730,8 +1736,8 @@ void SIRegisterInfo::buildSpillLoadStore( : Register(getSubReg(ValueReg, getSubRegFromChannel(RegOffset / 4, NumRegs))); - unsigned SOffsetRegState = 0; - unsigned SrcDstRegState = getDefRegState(!IsStore); + RegState SOffsetRegState = {}; + RegState SrcDstRegState = getDefRegState(!IsStore); const bool IsLastSubReg = i + 1 == e; const bool IsFirstSubReg = i == 0; if (IsLastSubReg) { @@ -1771,7 +1777,7 @@ void SIRegisterInfo::buildSpillLoadStore( } if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) { NeedSuperRegImpOperand = true; - unsigned State = SrcDstRegState; + RegState State = SrcDstRegState; if (!IsLastSubReg || (Lane != LaneE)) State &= ~RegState::Kill; if (!IsFirstSubReg || (Lane != LaneS)) @@ -1823,10 +1829,22 @@ void SIRegisterInfo::buildSpillLoadStore( } } + Register FinalValueReg = ValueReg; + if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) { + // If we are loading a 16-bit value with SRAMECC enabled we need a temp + 32-bit VGPR to load and extract 16-bits into the final register. + ValueReg = + RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); + SubReg = ValueReg; + IsKill = false; + } + + // Create the MMO, additionally set the NonVolatile flag as scratch memory + used for spills will not be used outside the thread. MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); - MachineMemOperand *NewMMO = - MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, - commonAlignment(Alignment, RegOffset)); + MachineMemOperand *NewMMO = MF->getMachineMemOperand( + PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize, + commonAlignment(Alignment, RegOffset)); auto MIB = BuildMI(MBB, MI, DL, *Desc) @@ -1863,6 +1881,17 @@ void SIRegisterInfo::buildSpillLoadStore( MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); + if (FinalValueReg != ValueReg) { + // Extract 16-bit from the loaded 32-bit value. 
+ ValueReg = getSubReg(ValueReg, AMDGPU::lo16); + MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) + .addReg(FinalValueReg, getDefRegState(true)) + .addImm(0) + .addReg(ValueReg, getKillRegState(true)) + .addImm(0); + ValueReg = FinalValueReg; + } + if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -1873,10 +1902,14 @@ void SIRegisterInfo::buildSpillLoadStore( MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); } - bool IsSrcDstDef = SrcDstRegState & RegState::Define; + bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define); + bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore; if (NeedSuperRegImpOperand && - (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) + (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) { MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + if (PartialReloadCopy) + MIB.addReg(ValueReg, RegState::Implicit); + } // The epilog restore of a wwm-scratch register can cause undesired // optimization during machine-cp post PrologEpilogInserter if the same @@ -1924,7 +1957,7 @@ void SIRegisterInfo::buildSpillLoadStore( void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const { - const MachineFunction *MF = MIB->getParent()->getParent(); + const MachineFunction *MF = MIB->getMF(); const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg); Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0); @@ -1953,13 +1986,15 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, SB.EltSize, Alignment); if (IsLoad) { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() + ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false, FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); } else { - unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + unsigned Opc = ST.hasFlatScratchEnabled() + ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill, FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS); // This only ever adds one VGPR spill @@ -2039,13 +2074,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, SB.prepare(); // SubReg carries the "Kill" flag when SubReg == SB.SuperReg. - unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); + RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); // Per VGPR helper data auto PVD = SB.getPerVGPRData(); for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { - unsigned TmpVGPRFlags = RegState::Undef; + RegState TmpVGPRFlags = RegState::Undef; // Write sub registers into the VGPR for (unsigned i = Offset * PVD.PerVGPR, @@ -2062,7 +2097,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, .addReg(SubReg, SubKillState) .addImm(i % PVD.PerVGPR) .addReg(SB.TmpVGPR, TmpVGPRFlags); - TmpVGPRFlags = 0; + TmpVGPRFlags = {}; if (Indexes) { if (i == 0) @@ -2075,7 +2110,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, // TODO: Can we detect this and skip the spill? if (SB.NumSubRegs > 1) { // The last implicit use of the SB.SuperReg carries the "Kill" flag. 
- unsigned SuperKillState = 0; + RegState SuperKillState = {}; if (i + 1 == SB.NumSubRegs) SuperKillState |= getKillRegState(SB.IsKill); WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); @@ -2185,10 +2220,10 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, RS); SB.prepare(); // Generate the spill of SGPR to SB.TmpVGPR. - unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); + RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill); auto PVD = SB.getPerVGPRData(); for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) { - unsigned TmpVGPRFlags = RegState::Undef; + RegState TmpVGPRFlags = RegState::Undef; // Write sub registers into the VGPR for (unsigned i = Offset * PVD.PerVGPR, e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs); @@ -2204,12 +2239,12 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, .addReg(SubReg, SubKillState) .addImm(i % PVD.PerVGPR) .addReg(SB.TmpVGPR, TmpVGPRFlags); - TmpVGPRFlags = 0; + TmpVGPRFlags = {}; // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? if (SB.NumSubRegs > 1) { // The last implicit use of the SB.SuperReg carries the "Kill" flag. 
- unsigned SuperKillState = 0; + RegState SuperKillState = {}; if (i + 1 == SB.NumSubRegs) SuperKillState |= getKillRegState(SB.IsKill); WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState); @@ -2294,7 +2329,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); + MachineFunction *MF = MI->getMF(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); @@ -2415,13 +2450,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) { - assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); + assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!"); Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR - : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR - : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; } auto *MBB = MI->getParent(); @@ -2500,13 +2535,15 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { - assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); - Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; + assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!"); + Opc = ST.d16PreservesUnusedBits() + ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 + : AMDGPU::SCRATCH_LOAD_USHORT_SADDR; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR - : ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR - : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; } auto *MBB = MI->getParent(); @@ -2585,7 +2622,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Offset = 0; } - if (FrameReg && !ST.enableFlatScratch()) { + if (FrameReg && !ST.hasFlatScratchEnabled()) { // We should just do an in-place update of the result register. However, // the value there may also be used by the add, in which case we need a // temporary register. @@ -2606,7 +2643,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) { - if (ST.enableFlatScratch() && + if (ST.hasFlatScratchEnabled() && !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) { // We didn't need the shift above, so we have an SGPR for the frame // register, but may have a VGPR only operand. @@ -2624,7 +2661,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR) .addReg(MaterializedReg, - MaterializedReg != FrameReg ? RegState::Kill : 0); + getKillRegState(MaterializedReg != FrameReg)); MaterializedReg = ScavengedVGPR; } @@ -2636,8 +2673,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (NumDefs == 2) AddI32.add(MI->getOperand(1)); - unsigned MaterializedRegFlags = - MaterializedReg != FrameReg ? 
RegState::Kill : 0; + RegState MaterializedRegFlags = + getKillRegState(MaterializedReg != FrameReg); if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) { // If we know we have a VGPR already, it's more likely the other @@ -2767,7 +2804,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg()) TmpReg = DstOp.getReg(); - if (FrameReg && !ST.enableFlatScratch()) { + if (FrameReg && !ST.hasFlatScratchEnabled()) { // FIXME: In the common case where the add does not also read its result // (i.e. this isn't a reg += fi), it's not finding the dest reg as // available. @@ -2852,7 +2889,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } int64_t Offset = FrameInfo.getObjectOffset(Index); - if (ST.enableFlatScratch()) { + if (ST.hasFlatScratchEnabled()) { if (TII->isFLATScratch(*MI)) { assert( (int16_t)FIOperandNum == @@ -2954,10 +2991,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI, false, 0, !UseSGPR); - // TODO: for flat scratch another attempt can be made with a VGPR index - // if no SGPRs can be scavenged. - if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) { + int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode()); + if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) { + Register TmpVGPR = RS->scavengeRegisterBackwards( + AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); + + // Materialize the frame register. + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + // Add the offset to the frame register. 
+ if (FrameReg && Offset) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg) + .addReg(FrameReg, RegState::Kill) + .addImm(Offset); + + BuildMI(*MBB, MI, DL, TII->get(SVOpcode)) + .add(MI->getOperand(0)) // $vdata + .addReg(TmpVGPR) // $vaddr + .addImm(0) // Offset + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol)); + MI->eraseFromParent(); + return true; + } report_fatal_error("Cannot scavenge register in FI elimination!"); + } if (!TmpSReg) { // Use frame register and restore it after. @@ -3019,7 +3082,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsMUBUF && !MFI->isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. - bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum)); bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); const TargetRegisterClass *RC = IsSALU && !LiveSCC @@ -3531,6 +3594,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { } const TargetRegisterClass * +SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const { + // TODO: In principle this should use AV classes for gfx908 too. This is + // limited to 90a+ to avoid regressing special case copy optimizations which + // need new handling. The core issue is that it's not possible to directly + // copy between AGPRs on gfx908, and the current optimizations around that + // expect to see copies to VGPR. + return ST.hasGFX90AInsts() ? 
getVectorSuperClassForBitWidth(BitWidth) + : getVGPRClassForBitWidth(BitWidth); +} + +const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 16 || BitWidth == 32) return &AMDGPU::SReg_32RegClass; @@ -3601,6 +3675,14 @@ SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { } const TargetRegisterClass * +SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const { + unsigned Size = getRegSizeInBits(*SRC); + const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size); + assert(ARC && "Invalid register class size"); + return ARC; +} + +const TargetRegisterClass * SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { unsigned Size = getRegSizeInBits(*VRC); if (Size == 32) @@ -3707,27 +3789,6 @@ bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, return RC && isAGPRClass(RC); } -bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const { - unsigned SrcSize = getRegSizeInBits(*SrcRC); - unsigned DstSize = getRegSizeInBits(*DstRC); - unsigned NewSize = getRegSizeInBits(*NewRC); - - // Do not increase size of registers beyond dword, we would need to allocate - // adjacent registers and constraint regalloc more than needed. - - // Always allow dword coalescing. 
- if (SrcSize <= 32 || DstSize <= 32) - return true; - - return NewSize <= DstSize || NewSize <= SrcSize; -} - unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first; @@ -3761,10 +3822,10 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, llvm_unreachable("Unexpected register pressure set!"); } -const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { +const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const { static const int Empty[] = { -1 }; - if (RegPressureIgnoredUnits[RegUnit]) + if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)]) return Empty; return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); @@ -3888,20 +3949,6 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } -const TargetRegisterClass * -SIRegisterInfo::getRegClass(unsigned RCID) const { - switch ((int)RCID) { - case AMDGPU::SReg_1RegClassID: - return getBoolRC(); - case AMDGPU::SReg_1_XEXECRegClassID: - return getWaveMaskRegClass(); - case -1: - return nullptr; - default: - return AMDGPUGenRegisterInfo::getRegClass(RCID); - } -} - // Find reaching register definition MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, @@ -3990,28 +4037,6 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { return true; } -const TargetRegisterClass * -SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { - if (!RC || !ST.needsAlignedVGPRs()) - return RC; - - unsigned Size = getRegSizeInBits(*RC); - if (Size <= 32) - return RC; - - if (RC == &AMDGPU::VS_64RegClass) - return &AMDGPU::VS_64_Align2RegClass; - - if (isVGPRClass(RC)) - return getAlignedVGPRClassForBitWidth(Size); - if (isAGPRClass(RC)) - return getAlignedAGPRClassForBitWidth(Size); - if (isVectorSuperClass(RC)) - return 
getAlignedVectorSuperClassForBitWidth(Size); - - return RC; -} - ArrayRef<MCPhysReg> SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7b91ba7..9d1a9ea 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -107,9 +107,7 @@ public: // Stack access is very expensive. CSRs are also the high registers, and we // want to minimize the number of used registers. - unsigned getCSRFirstUseCost() const override { - return 100; - } + unsigned getCSRCost() const override { return 100; } // When building a block VGPR load, we only really transfer a subset of the // registers in the block, based on a mask. Liveness analysis is not aware of @@ -216,6 +214,10 @@ public: getVectorSuperClassForBitWidth(unsigned BitWidth) const; LLVM_READONLY + const TargetRegisterClass * + getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const; + + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); /// \returns true if this class contains only SGPR registers @@ -285,6 +287,10 @@ public: const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const; + /// \returns An AGPR+VGPR super reg class with the same width as \p SRC + const TargetRegisterClass * + getEquivalentAVClass(const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; @@ -338,14 +344,6 @@ public: ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const; - bool shouldCoalesce(MachineInstr *MI, - const TargetRegisterClass *SrcRC, - unsigned SubReg, - const TargetRegisterClass *DstRC, - unsigned DstSubReg, - const TargetRegisterClass *NewRC, - LiveIntervals &LIS) 
const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -357,7 +355,7 @@ public: const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - const int *getRegUnitPressureSets(unsigned RegUnit) const override; + const int *getRegUnitPressureSets(MCRegUnit RegUnit) const override; MCRegister getReturnAddressReg(const MachineFunction &MF) const; @@ -391,8 +389,6 @@ public: MCRegister getExec() const; - const TargetRegisterClass *getRegClass(unsigned RCID) const; - // Find reaching register definition MachineInstr *findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, @@ -433,11 +429,6 @@ public: // the subtarget. bool isProperlyAlignedRC(const TargetRegisterClass &RC) const; - // Given \p RC returns corresponding aligned register class if required - // by the subtarget. - const TargetRegisterClass * - getProperlyAlignedRC(const TargetRegisterClass *RC) const; - /// Return all SGPR128 which satisfy the waves per execution unit requirement /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; @@ -495,6 +486,17 @@ public: SmallVector<StringLiteral> getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; + + float + getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override { + // Prioritize VGPR_32_Lo256 over other classes which may occupy registers + // beyond v256. + return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) * + ((RC == &AMDGPU::VGPR_32_Lo256RegClass || + RC == &AMDGPU::VReg_64_Lo256_Align2RegClass) + ? 
2.0 + : 1.0); + } }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index fc8f46a..493e267 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -614,9 +614,9 @@ def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 127), (sequence "VGPR%u_HI16", 0, 127)))> { + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; - let isAllocatable = 0; // This is the base class for VGPR{0..127}_{LO16,HI16}. let BaseClassOrder = 16; @@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1 // Identical to VGPR_32 except it only contains the low 256 (Lo256) registers. def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -896,20 +896,6 @@ def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v let Size = 64; } -def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_64_XEXEC, SReg_32_XEXEC)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - -def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, - (add SReg_1_XEXEC, EXEC, EXEC_LO, EXEC_HI)> { - let CopyCost = 1; - let isAllocatable = 0; - let HasSGPR = 1; -} - multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, @@ -1205,6 +1191,34 @@ defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; } +def SReg_1_XEXEC : SIRegisterClassLike<0, false, 
false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_64_XEXEC, + SReg_32_XM0_XEXEC, // FIXME: Why do the wave32 cases exclude m0? + SReg_32_XM0_XEXEC] +>; + +def SReg_1 : SIRegisterClassLike<0, false, false, true>, + RegClassByHwMode< + [DefaultMode_Wave64, + AlignedVGPRNoAGPRMode_Wave64, + AVAlign2LoadStoreMode, + DefaultMode_Wave32, + AlignedVGPRNoAGPRMode_Wave32], + [SReg_64, + SReg_64, + SReg_64, + SReg_32, + SReg_32] +>; + //===----------------------------------------------------------------------===// // // AlignTarget classes. Artifical classes to swap between @@ -1212,17 +1226,36 @@ defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_102 // //===----------------------------------------------------------------------===// +// We have 3 orthogonal properties to consider. Unfortunately we need +// to define the cross product of these states, minus unused +// combinations. 
+ def AV_LdSt_32_Target : RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VGPR_32, AV_32, VGPR_32]>, SIRegisterClassLike<32, true, true> { + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, + VGPR_32, + AV_32, + VGPR_32, + VGPR_32]>, + SIRegisterClassLike<32, true, true> { let DecoderMethod = "decodeAVLdSt"; } foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 1024 ] in { def VReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, + DefaultMode_Wave32, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeVReg_"#RegSize#"RegisterClass"; @@ -1230,45 +1263,59 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512, 10 def AReg_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, /*Unused combination*/], + [DefaultMode_Wave64, /*unused combination*/ AVAlign2LoadStoreMode, /*Unused combination*/ /*Unused combination*/], [!cast<RegisterClass>("AReg_"#RegSize), + /*unused combination*/ !cast<RegisterClass>("AReg_"#RegSize#_Align2) + /*Unused combination*/ /*Unused combination*/]> { let DecoderMethod = "DecodeAReg_"#RegSize#"RegisterClass"; } def AV_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave32, + DefaultMode_Wave64, + AVAlign2LoadStoreMode, + AlignedVGPRNoAGPRMode_Wave64, + AlignedVGPRNoAGPRMode_Wave32], 
[!cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("AV_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "DecodeAV_"#RegSize#"RegisterClass"; } def AV_LdSt_#RegSize#_AlignTarget : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align2 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("AV_"#RegSize#_Align2), + !cast<RegisterClass>("VReg_"#RegSize#_Align2), !cast<RegisterClass>("VReg_"#RegSize#_Align2)]> { let DecoderMethod = "decodeAVLdSt"; } def AV_LdSt_#RegSize#_Align1 : SIRegisterClassLike<RegSize, true, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], [!cast<RegisterClass>("VReg_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("AV_"#RegSize), + !cast<RegisterClass>("VReg_"#RegSize), !cast<RegisterClass>("VReg_"#RegSize)]> { let DecoderMethod = "decodeAVLdSt"; } @@ -1276,11 +1323,22 @@ foreach RegSize = [ 64, 96, 128, 160, 192, 224, 256, 288, 320, 
352, 384, 512, 10 def VS_64_AlignTarget : SIRegisterClassLike<64, true, false, true>, RegClassByHwMode< - [DefaultMode, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode], - [VS_64, VS_64_Align2, VS_64_Align2]> { + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VS_64, VS_64, VS_64_Align2, VS_64_Align2, VS_64_Align2]> { let DecoderMethod = "decodeSrcRegOrImm9"; } + +// Special case for DS_GWS instructions. The register input is really +// 32-bit, but it needs to be even aligned on targets with a VGPR +// alignment requirement. +def AV_LdSt_32_Align2 : SIRegisterClassLike</*Bitwidth=*/32, /*VGPR=*/true, /*AGPR=*/true>, + RegClassByHwMode< + [DefaultMode_Wave64, DefaultMode_Wave32, AVAlign2LoadStoreMode, AlignedVGPRNoAGPRMode_Wave64, AlignedVGPRNoAGPRMode_Wave32], + [VGPR_32, VGPR_32, AV_64_Align2, VReg_64_Align2, VReg_64_Align2]> { + let DecoderMethod = "decodeAVLdSt<32>"; +} + class RegImmMatcher<string name> : AsmOperandClass { let Name = name; let RenderMethod = "addRegOrImmOperands"; @@ -1314,12 +1372,12 @@ class SrcRegOrImm9_t16<string operandType, RegisterClass regClass = VS_16> let EncoderMethod = "getMachineOpValueT16"; } -def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT16">; -def SSrc_bf16: SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">; -def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">; -def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">; -def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">; -def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, "OPERAND_REG_IMM_INT64">; +def SSrc_b16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT16">; +def SSrc_bf16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">; +def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">; +def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">; +def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">; +def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, 
"OPERAND_REG_IMM_INT64">; def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPERAND_REG_IMM_INT32">; @@ -1335,35 +1393,35 @@ def SCSrc_b64 : SrcRegOrImm9 <SReg_64, "OPERAND_REG_INLINE_C_INT64">; //===----------------------------------------------------------------------===// // The current and temporary future default used case for VOP3. -def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT16">; -def VSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_BF16">; -def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP16">; +def VSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT16">; +def VSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_BF16">; +def VSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP16">; // True16 VOP3 operands. -def VSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16">; +def VSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16">; def VSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16">; -def VSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16">; +def VSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16">; // True16 VOP1/2/C operands. let DecoderMethodName = "decodeOperand_VSrcT16_Lo128", EncoderMethod = "getMachineOpValueT16Lo128" in { - def VSrcT_b16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16", VS_16_Lo128>; - def VSrcT_bf16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16", VS_16_Lo128>; - def VSrcT_f16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16", VS_16_Lo128>; + def VSrcT_b16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_INT16", VS_16_Lo128>; + def VSrcT_bf16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_BF16", VS_16_Lo128>; + def VSrcT_f16_Lo128 : SrcRegOrImm9_t16 <"OPERAND_REG_IMM_FP16", VS_16_Lo128>; } // End DecoderMethodName = "decodeOperand_VSrcT16_Lo128", EncoderMethod = "getMachineOpValueT16Lo128" // The current and temporary future default used case for fake VOP1/2/C. // For VOP1,2,C True16 instructions. _Lo128 use first 128 32-bit VGPRs only. 
-def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_INT16">; +def VSrcFake16_b16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_INT16">; def VSrcFake16_bf16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_BF16">; -def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_FP16">; +def VSrcFake16_f16_Lo128 : SrcRegOrImm9 <VS_32_Lo128, "OPERAND_REG_IMM_FP16">; -def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT32">; -def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; -def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; +def VSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_INT32">; +def VSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_FP32">; +def VSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2INT16">; def VSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2BF16">; -def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; -def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; -def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { +def VSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16">; +def VSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_INT64">; +def VSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_FP64"> { let DecoderMethod = "decodeOperand_VSrc_f64"; } def VSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2INT32">; @@ -1371,6 +1429,8 @@ def VSrc_v2f32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_IMM_V2FP32">; def VSrc_NoInline_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_NOINLINE_V2FP16">; +def VSrc_v2f16_splat : SrcRegOrImm9 <VS_32, "OPERAND_REG_IMM_V2FP16_SPLAT">; + //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR //===----------------------------------------------------------------------===// @@ -1381,15 +1441,15 @@ class SrcReg9<RegisterClassLike regClass> : RegisterOperand<regClass> { let DecoderMethod = 
"decodeSrcReg9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def VRegSrc_32 : SrcReg9<VGPR_32>; -def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; -def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; -def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; -def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; -def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; -def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; -def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; -def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; +def VRegSrc_32 : SrcReg9<VGPR_32>; +def VRegSrc_64 : SrcReg9<VReg_64_AlignTarget>; +def VRegSrc_96 : SrcReg9<VReg_96_AlignTarget>; +def VRegSrc_128 : SrcReg9<VReg_128_AlignTarget>; +def VRegSrc_192 : SrcReg9<VReg_192_AlignTarget>; +def VRegSrc_256 : SrcReg9<VReg_256_AlignTarget>; +def VRegSrc_384 : SrcReg9<VReg_384_AlignTarget>; +def VRegSrc_512 : SrcReg9<VReg_512_AlignTarget>; +def VRegSrc_1024 : SrcReg9<VReg_1024_AlignTarget>; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32>; // True 16 Operands @@ -1454,44 +1514,44 @@ def ARegSrc_32 : AVOperand<AGPR_32, "decodeSrcA9">; // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// -def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT16">; -def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; -def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; -def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; -def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; -def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; -def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; -def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; -def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; -def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, 
"OPERAND_REG_INLINE_C_INT32">; -def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; -def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT16">; +def VCSrc_bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_BF16">; +def VCSrc_f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP16">; +def VCSrc_b32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_INT32">; +def VCSrc_f32 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_FP32">; +def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; +def VCSrc_v2bf16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; +def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_b32_Lo256 : SrcRegOrImm9 <VS_32_Lo256, "OPERAND_REG_INLINE_C_INT32">; +def VCSrc_b64_Lo256 : SrcRegOrImm9 <VS_64_Lo256, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_b64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_INT64">; +def VCSrc_f64 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64_AlignTarget, "OPERAND_REG_INLINE_C_V2INT32">; // True 16 Operands -def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; -def VCSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_BF16">; -def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; +def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; +def VCSrcT_bf16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_BF16">; +def VCSrcT_f16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_FP16">; //===----------------------------------------------------------------------===// // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// -def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_64_b32 : 
SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; -def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; -def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; -def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; -def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; -def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_64_bf16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_64_f16 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_64_b32 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_64_f64 : SrcRegOrImm9 <VReg_64_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_128_bf16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_BF16">; +def VISrc_128_f16 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP16">; +def VISrc_128_b32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_128_f32 : SrcRegOrImm9 <VReg_128_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_b32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def 
VISrc_256_f32 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_256_f64 : SrcRegOrImm9 <VReg_256_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_512_b32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_512_f32 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512_AlignTarget, "OPERAND_REG_INLINE_C_FP64">; +def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_INT32">; +def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_FP32">; //===----------------------------------------------------------------------===// // AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR @@ -1500,13 +1560,13 @@ def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024_AlignTarget, "OPERAND_REG_INLINE_C_ class AVSrcOperand<RegisterClassLike regClass> : AVOperand<regClass, "decodeSrcAV10">; -def AVSrc_32 : AVSrcOperand<AV_32>; -def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; +def AVSrc_32 : AVSrcOperand<AV_32>; +def AVSrc_64 : AVSrcOperand<AV_64_AlignTarget>; def AVSrc_128 : AVSrcOperand<AV_128_AlignTarget>; def AVSrc_192 : AVSrcOperand<AV_192_AlignTarget>; def AVSrc_256 : AVSrcOperand<AV_256_AlignTarget>; -def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; +def AVSrc_64_Align2 : AVSrcOperand<AV_64_Align2>; def AVSrc_128_Align2 : AVSrcOperand<AV_128_Align2>; def AVSrc_192_Align2 : AVSrcOperand<AV_192_Align2>; def AVSrc_256_Align2 : AVSrcOperand<AV_256_Align2>; @@ -1528,11 +1588,22 @@ class AVLdStOperand<RegisterClassLike regClass> def AVLdSt_32 : AVLdStOperand<AV_LdSt_32_Target>; foreach size = ["64", "96", "128", "160", "256", "1024" ] in { - def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; + def AVLdSt_#size : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_AlignTarget)>; def AVLdSt_#size#_Align1 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align1)>; def 
AVLdSt_#size#_Align2 : AVLdStOperand<!cast<RegisterClassLike>("AV_LdSt_"#size#_Align2)>; } +def AV_LdSt_32_Align2_RegMatcher : AsmOperandClass { + let Name = "AV_LdSt_32_Align2_RegOp"; + let RenderMethod = "addRegOperands"; +} + +def AV_LdSt_32_Align2_RegOp : RegisterOperand<AV_LdSt_32_Align2> { + let ParserMatchClass = AV_LdSt_32_Align2_RegMatcher; + let PrintMethod = "printAVLdSt32Align2RegOp"; + let EncoderMethod = "getAVOperandEncoding"; +} + //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// @@ -1542,14 +1613,14 @@ class SrcRegOrImmA9<RegisterClassLike regClass, string operandType> let DecoderMethod = "decodeSrcRegOrImmA9<" # !cast<SIRegisterClassLike>(regClass).Size # ">"; } -def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; -def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; -def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; -def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_64_f64 : SrcRegOrImmA9 <AReg_64_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_128_f32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_128_b32 : SrcRegOrImmA9 <AReg_128_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_256_f64 : SrcRegOrImmA9 <AReg_256_AlignTarget, "OPERAND_REG_INLINE_AC_FP64">; +def AISrc_512_f32 : SrcRegOrImmA9 <AReg_512_AlignTarget, 
"OPERAND_REG_INLINE_AC_FP32">; +def AISrc_512_b32 : SrcRegOrImmA9 <AReg_512_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; +def AISrc_1024_f32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_FP32">; +def AISrc_1024_b32 : SrcRegOrImmA9 <AReg_1024_AlignTarget, "OPERAND_REG_INLINE_AC_INT32">; //===----------------------------------------------------------------------===// // Tablegen programming utilities diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 179ecba..14ed778 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -27,6 +27,8 @@ using namespace llvm; namespace { +enum ChangeKind { None, UpdateHint, UpdateInst }; + class SIShrinkInstructions { MachineFunction *MF; MachineRegisterInfo *MRI; @@ -41,10 +43,10 @@ class SIShrinkInstructions { bool isKUImmOperand(const MachineOperand &Src) const; bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; - void shrinkScalarCompare(MachineInstr &MI) const; - void shrinkMIMG(MachineInstr &MI) const; - void shrinkMadFma(MachineInstr &MI) const; - bool shrinkScalarLogicOp(MachineInstr &MI) const; + bool shrinkScalarCompare(MachineInstr &MI) const; + bool shrinkMIMG(MachineInstr &MI) const; + bool shrinkMadFma(MachineInstr &MI) const; + ChangeKind shrinkScalarLogicOp(MachineInstr &MI) const; bool tryReplaceDeadSDST(MachineInstr &MI) const; bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg, unsigned SubReg) const; @@ -241,27 +243,30 @@ void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, } } -void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { +bool SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!ST->hasSCmpK()) - return; + return false; // cmpk instructions do scc = dst <cc op> imm16, so 
commute the instruction to // get constants on the RHS. - if (!MI.getOperand(0).isReg()) - TII->commuteInstruction(MI, false, 0, 1); + bool Changed = false; + if (!MI.getOperand(0).isReg()) { + if (TII->commuteInstruction(MI, false, 0, 1)) + Changed = true; + } // cmpk requires src0 to be a register const MachineOperand &Src0 = MI.getOperand(0); if (!Src0.isReg()) - return; + return Changed; MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) - return; + return Changed; int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); if (SOPKOpc == -1) - return; + return Changed; // eq/ne is special because the imm16 can be treated as signed or unsigned, // and initially selected to the unsigned versions. @@ -275,9 +280,10 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { } MI.setDesc(TII->get(SOPKOpc)); + Changed = true; } - return; + return Changed; } const MCInstrDesc &NewDesc = TII->get(SOPKOpc); @@ -287,14 +293,16 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!SIInstrInfo::sopkIsZext(SOPKOpc)) Src1.setImm(SignExtend64(Src1.getImm(), 32)); MI.setDesc(NewDesc); + Changed = true; } + return Changed; } // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 
-void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { +bool SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); if (!Info) - return; + return false; uint8_t NewEncoding; switch (Info->MIMGEncoding) { @@ -305,7 +313,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { NewEncoding = AMDGPU::MIMGEncGfx11Default; break; default: - return; + return false; } int VAddr0Idx = @@ -359,7 +367,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { } else if (Vgpr == NextVgpr) { NextVgpr = Vgpr + Dwords; } else { - return; + return false; } if (!Op.isUndef()) @@ -369,7 +377,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { } if (VgprBase + NewAddrDwords > 256) - return; + return false; // Further check for implicit tied operands - this may be present if TFE is // enabled @@ -408,21 +416,22 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), ToUntie - (EndVAddr - 1)); } + return true; } // Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK. -void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { +bool SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so // there is no reason to try to shrink them. if (!ST->hasVOP3Literal()) - return; + return false; // There is no advantage to doing this pre-RA. 
if (!IsPostRA) - return; + return false; if (TII->hasAnyModifiersSet(MI)) - return; + return false; const unsigned Opcode = MI.getOpcode(); MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); @@ -439,7 +448,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg())) Swap = true; else - return; + return false; switch (Opcode) { default: @@ -477,7 +486,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { else if (Src0.isImm() && !TII->isInlineConstant(Src0)) Swap = true; else - return; + return false; switch (Opcode) { default: @@ -509,10 +518,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) - return; + return false; if (AMDGPU::isTrue16Inst(NewOpcode) && !shouldShrinkTrue16(MI)) - return; + return false; if (Swap) { // Swap Src0 and Src1 by building a new instruction. @@ -527,14 +536,17 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { TII->removeModOperands(MI); MI.setDesc(TII->get(NewOpcode)); } + return true; } /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. /// If the inverse of the immediate is legal, use ANDN2, ORN2 or /// XNOR (as a ^ b == ~(a ^ ~b)). -/// \returns true if the caller should continue the machine function iterator -bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { +/// \return ChangeKind::None if no changes were made. +/// ChangeKind::UpdateHint if regalloc hints were updated. +/// ChangeKind::UpdateInst if the instruction was modified. 
+ChangeKind SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); @@ -544,13 +556,14 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (!SrcImm->isImm() || AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm())) - return false; + return ChangeKind::None; uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); uint32_t NewImm = 0; if (Opc == AMDGPU::S_AND_B32) { - if (isPowerOf2_32(~Imm)) { + if (isPowerOf2_32(~Imm) && + MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) { NewImm = llvm::countr_one(Imm); Opc = AMDGPU::S_BITSET0_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { @@ -558,7 +571,8 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { Opc = AMDGPU::S_ANDN2_B32; } } else if (Opc == AMDGPU::S_OR_B32) { - if (isPowerOf2_32(Imm)) { + if (isPowerOf2_32(Imm) && + MI.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)->isDead()) { NewImm = llvm::countr_zero(Imm); Opc = AMDGPU::S_BITSET1_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { @@ -578,13 +592,13 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (Dest->getReg().isVirtual() && SrcReg->isReg()) { MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); - return true; + return ChangeKind::UpdateHint; } if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { const bool IsUndef = SrcReg->isUndef(); const bool IsKill = SrcReg->isKill(); - MI.setDesc(TII->get(Opc)); + TII->mutateAndCleanupImplicit(MI, TII->get(Opc)); if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) { Src0->ChangeToImmediate(NewImm); @@ -596,10 +610,11 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { } else { 
SrcImm->setImm(NewImm); } + return ChangeKind::UpdateInst; } } - return false; + return ChangeKind::None; } // This is the same as MachineInstr::readsRegister/modifiesRegister except @@ -791,10 +806,10 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { Y1 = getSubRegForIndex(Y, Ysub, I); auto *MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), TII->get(AMDGPU::V_SWAP_B32)) - .addDef(X1.Reg, 0, X1.SubReg) - .addDef(Y1.Reg, 0, Y1.SubReg) - .addReg(Y1.Reg, 0, Y1.SubReg) - .addReg(X1.Reg, 0, X1.SubReg) + .addDef(X1.Reg, {}, X1.SubReg) + .addDef(Y1.Reg, {}, Y1.SubReg) + .addReg(Y1.Reg, {}, Y1.SubReg) + .addReg(X1.Reg, {}, X1.SubReg) .getInstr(); Swaps.push_back(MIB); } @@ -854,6 +869,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { IsPostRA = MF.getProperties().hasNoVRegs(); unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I, Next; @@ -877,6 +893,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { if (ModOpcode != 0) { MI.setDesc(TII->get(ModOpcode)); Src.setImm(static_cast<int64_t>(ModImm)); + Changed = true; continue; } } @@ -887,20 +904,35 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::COPY)) { if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); + Changed = true; continue; } } + // Shrink scalar logic operations. + if (MI.getOpcode() == AMDGPU::S_AND_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32 || + MI.getOpcode() == AMDGPU::S_XOR_B32) { + ChangeKind CK = shrinkScalarLogicOp(MI); + if (CK == ChangeKind::UpdateHint) + continue; + Changed |= (CK == ChangeKind::UpdateInst); + } + // Try to use S_ADDK_I32 and S_MULK_I32. 
if (MI.getOpcode() == AMDGPU::S_ADD_I32 || - MI.getOpcode() == AMDGPU::S_MUL_I32) { + MI.getOpcode() == AMDGPU::S_MUL_I32 || + (MI.getOpcode() == AMDGPU::S_OR_B32 && + MI.getFlag(MachineInstr::MIFlag::Disjoint))) { const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); MachineOperand *Src1 = &MI.getOperand(2); if (!Src0->isReg() && Src1->isReg()) { - if (TII->commuteInstruction(MI, false, 1, 2)) + if (TII->commuteInstruction(MI, false, 1, 2)) { std::swap(Src0, Src1); + Changed = true; + } } // FIXME: This could work better if hints worked with subregisters. If @@ -911,22 +943,22 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } - if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { if (Src1->isImm() && isKImmOperand(*Src1)) { - unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? - AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; - + unsigned Opc = (MI.getOpcode() == AMDGPU::S_MUL_I32) + ? AMDGPU::S_MULK_I32 + : AMDGPU::S_ADDK_I32; Src1->setImm(SignExtend64(Src1->getImm(), 32)); MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); + Changed = true; } } } // Try to use s_cmpk_* if (MI.isCompare() && TII->isSOPC(MI)) { - shrinkScalarCompare(MI); + Changed |= shrinkScalarCompare(MI); continue; } @@ -941,27 +973,21 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { if (isKImmOperand(Src)) { MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); Src.setImm(SignExtend64(Src.getImm(), 32)); + Changed = true; } else if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/true))) { MI.setDesc(TII->get(ModOpc)); Src.setImm(static_cast<int64_t>(ModImm)); + Changed = true; } } continue; } - // Shrink scalar logic operations. 
- if (MI.getOpcode() == AMDGPU::S_AND_B32 || - MI.getOpcode() == AMDGPU::S_OR_B32 || - MI.getOpcode() == AMDGPU::S_XOR_B32) { - if (shrinkScalarLogicOp(MI)) - continue; - } - if (IsPostRA && TII->isMIMG(MI.getOpcode()) && ST->getGeneration() >= AMDGPUSubtarget::GFX10) { - shrinkMIMG(MI); + Changed |= shrinkMIMG(MI); continue; } @@ -977,14 +1003,14 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 || (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 && ST->hasFmaakFmamkF64Insts())) { - shrinkMadFma(MI); + Changed |= shrinkMadFma(MI); continue; } // If there is no chance we will shrink it and use VCC as sdst to get // a 32 bit form try to replace dead sdst with NULL. if (TII->isVOP3(MI.getOpcode())) { - tryReplaceDeadSDST(MI); + Changed |= tryReplaceDeadSDST(MI); if (!TII->hasVALU32BitEncoding(MI.getOpcode())) { continue; } @@ -995,9 +1021,12 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { // it. if (!MI.isCommutable() || !TII->commuteInstruction(MI) || !TII->canShrink(MI, *MRI)) { - tryReplaceDeadSDST(MI); + Changed |= tryReplaceDeadSDST(MI); continue; } + + // Operands were commuted. 
+ Changed = true; } int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); @@ -1101,9 +1130,10 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { foldImmediates(*Inst32); LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); + Changed = true; } } - return false; + return Changed; } bool SIShrinkInstructionsLegacy::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 6611e1e..5fd0c1e 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -188,8 +188,9 @@ private: void markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); - void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, - unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist); + void markDefs(const MachineInstr &UseMI, LiveRange &LR, + VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag, + std::vector<WorkItem> &Worklist); void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, std::vector<WorkItem> &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, @@ -318,8 +319,8 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, /// Mark all relevant definitions of register \p Reg in usage \p UseMI. void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, - Register Reg, unsigned SubReg, char Flag, - std::vector<WorkItem> &Worklist) { + VirtRegOrUnit VRegOrUnit, unsigned SubReg, + char Flag, std::vector<WorkItem> &Worklist) { LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI); LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI)); @@ -331,8 +332,9 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, // cover registers. const LaneBitmask UseLanes = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) - : (Reg.isVirtual() ? 
MRI->getMaxLaneMaskForVReg(Reg) - : LaneBitmask::getNone()); + : (VRegOrUnit.isVirtualReg() + ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg()) + : LaneBitmask::getNone()); // Perform a depth-first iteration of the LiveRange graph marking defs. // Stop processing of a given branch when all use lanes have been defined. @@ -382,11 +384,11 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); assert(MI && "Def has no defining instruction"); - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { // Iterate over all operands to find relevant definitions bool HasDef = false; for (const MachineOperand &Op : MI->all_defs()) { - if (Op.getReg() != Reg) + if (Op.getReg() != VRegOrUnit.asVirtualReg()) continue; // Compute lanes defined and overlap with use @@ -453,7 +455,7 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, << " for " << MI); if (Reg.isVirtual()) { LiveRange &LR = LIS->getInterval(Reg); - markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist); + markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist); } else { // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, @@ -462,7 +464,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, LiveRange &LR = LIS->getRegUnit(Unit); const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); if (Value) - markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); + markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag, + Worklist); } } } @@ -1101,10 +1104,15 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( LiveRange &LR = LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin()); auto MBBE = MBB.end(); - SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First) - : LIS->getMBBEndIdx(&MBB); - SlotIndex LastIdx = - Last != MBBE ? 
LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB); + // Skip debug instructions when getting slot indices, as they don't have + // entries in the slot index map. + auto FirstNonDbg = skipDebugInstructionsForward(First, MBBE); + auto LastNonDbg = skipDebugInstructionsForward(Last, MBBE); + SlotIndex FirstIdx = FirstNonDbg != MBBE + ? LIS->getInstructionIndex(*FirstNonDbg) + : LIS->getMBBEndIdx(&MBB); + SlotIndex LastIdx = LastNonDbg != MBBE ? LIS->getInstructionIndex(*LastNonDbg) + : LIS->getMBBEndIdx(&MBB); SlotIndex Idx = PreferLast ? LastIdx : FirstIdx; const LiveRange::Segment *S; @@ -1121,8 +1129,8 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( } else { MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex()); assert(EndMI && "Segment does not end on valid instruction"); - auto NextI = std::next(EndMI->getIterator()); - if (NextI == MBB.end()) + auto NextI = next_nodbg(EndMI->getIterator(), MBB.instr_end()); + if (NextI == MBB.instr_end()) break; SlotIndex Next = LIS->getInstructionIndex(*NextI); if (Next > LastIdx) @@ -1176,16 +1184,17 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, } } + const DebugLoc &DL = MBB.findDebugLoc(Before); MachineInstr *MI; if (SaveWQM) { unsigned Opcode = IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) - .addReg(LiveMaskReg); + MI = + BuildMI(MBB, Before, DL, TII->get(Opcode), SaveWQM).addReg(LiveMaskReg); } else { unsigned Opcode = IsTerminator ? 
LMC.AndTermOpc : LMC.AndOpc; - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg) + MI = BuildMI(MBB, Before, DL, TII->get(Opcode), LMC.ExecReg) .addReg(LMC.ExecReg) .addReg(LiveMaskReg); } @@ -1197,13 +1206,14 @@ void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SavedWQM) { + const DebugLoc &DL = MBB.findDebugLoc(Before); MachineInstr *MI; if (SavedWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg) + MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::COPY), LMC.ExecReg) .addReg(SavedWQM); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(LMC.WQMOpc), LMC.ExecReg) + MI = BuildMI(MBB, Before, DL, TII->get(LMC.WQMOpc), LMC.ExecReg) .addReg(LMC.ExecReg); } @@ -1219,13 +1229,13 @@ void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, assert(StrictStateNeeded == StateStrictWWM || StrictStateNeeded == StateStrictWQM); + const DebugLoc &DL = MBB.findDebugLoc(Before); + if (StrictStateNeeded == StateStrictWWM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), - SaveOrig) + MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WWM), SaveOrig) .addImm(-1); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), - SaveOrig) + MI = BuildMI(MBB, Before, DL, TII->get(AMDGPU::ENTER_STRICT_WQM), SaveOrig) .addImm(-1); } LIS->InsertMachineInstrInMaps(*MI); @@ -1242,14 +1252,16 @@ void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, assert(CurrentStrictState == StateStrictWWM || CurrentStrictState == StateStrictWQM); + const DebugLoc &DL = MBB.findDebugLoc(Before); + if (CurrentStrictState == StateStrictWWM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), - LMC.ExecReg) - .addReg(SavedOrig); + MI = + BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WWM), LMC.ExecReg) + .addReg(SavedOrig); } else { - MI = BuildMI(MBB, 
Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), - LMC.ExecReg) - .addReg(SavedOrig); + MI = + BuildMI(MBB, Before, DL, TII->get(AMDGPU::EXIT_STRICT_WQM), LMC.ExecReg) + .addReg(SavedOrig); } LIS->InsertMachineInstrInMaps(*MI); StateTransition[MI] = NonStrictState; @@ -1629,7 +1641,7 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { } // Insert instruction sequence at block beginning (before vector operations). - const DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const unsigned WavefrontSize = ST->getWavefrontSize(); const unsigned Mask = (WavefrontSize << 1) - 1; Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 781c61b0..ee8d29c 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -1464,7 +1464,7 @@ class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName, class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> : SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12, SGPR_NULL_gfx11plus> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; let DecoderNamespace = "GFX12"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); @@ -1537,3 +1537,84 @@ multiclass SMEM_Real_Probe_gfx12<bits<6> op> { defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>; defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>; + +//===----------------------------------------------------------------------===// +// GFX13. 
+//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx13<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> : + SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX13, + SGPR_NULL_gfx11plus> { + let AssemblerPredicate = isGFX13Plus; + let DecoderNamespace = "GFX13"; + + let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); +} + +class SMEM_Real_Prefetch_gfx13<bits<6> op, SM_Pseudo ps> : + SMEM_Real_gfx13<op, ps> { + bits<7> sdata; // Only 5 bits of sdata are supported. + + let sdst = ?; + let Inst{12-11} = 0; // Unused sdata bits. + let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?); +} + +class SMEM_Real_Load_gfx13<bits<6> op, string ps, string opName, OffsetMode offsets> : + SMEM_Real_gfx13<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); + + let Inst{20} = cpol{CPolBit.NV}; // non-volatile + let Inst{22-21} = cpol{4-3}; // scope + let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported + let Inst{56} = cpol{CPolBit.SCAL}; // scale offset +} + +multiclass SM_Real_Loads_gfx13<bits<6> op, string ps = NAME> { + defvar opName = !tolower(NAME); + def _IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, IMM_Offset>; + def _SGPR_IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, SGPR_IMM_OptOffset>; +} + +defm S_LOAD_B32 : SM_Real_Loads_gfx13<0x00, "S_LOAD_DWORD">; +defm S_LOAD_B64 : SM_Real_Loads_gfx13<0x01, "S_LOAD_DWORDX2">; +defm S_LOAD_B96 : SM_Real_Loads_gfx13<0x0e, "S_LOAD_DWORDX3">; +defm S_LOAD_B128 : SM_Real_Loads_gfx13<0x02, "S_LOAD_DWORDX4">; +defm S_LOAD_B256 : SM_Real_Loads_gfx13<0x03, "S_LOAD_DWORDX8">; +defm S_LOAD_B512 : SM_Real_Loads_gfx13<0x04, "S_LOAD_DWORDX16">; + +defm S_LOAD_I8 : SM_Real_Loads_gfx13<0x30>; +defm S_LOAD_U8 : SM_Real_Loads_gfx13<0x31>; +defm S_LOAD_I16 : 
SM_Real_Loads_gfx13<0x32>; +defm S_LOAD_U16 : SM_Real_Loads_gfx13<0x33>; + +defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx13<0x08, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx13<0x09, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx13<0x0d, "S_BUFFER_LOAD_DWORDX3">; +defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx13<0x0a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx13<0x0b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx13<0x0c, "S_BUFFER_LOAD_DWORDX16">; + +defm S_BUFFER_LOAD_I8 : SM_Real_Loads_gfx13<0x34>; +defm S_BUFFER_LOAD_U8 : SM_Real_Loads_gfx13<0x35>; +defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx13<0x36>; +defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx13<0x37>; + +def S_DCACHE_INV_gfx13 : SMEM_Real_gfx13<0x020, S_DCACHE_INV>; + +def S_PREFETCH_INST_gfx13 : SMEM_Real_Prefetch_gfx13<0x22, S_PREFETCH_INST>; +def S_PREFETCH_INST_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x23, S_PREFETCH_INST_PC_REL>; +def S_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2c, S_PREFETCH_DATA>; +def S_BUFFER_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2d, S_BUFFER_PREFETCH_DATA>; +def S_PREFETCH_DATA_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x2e, S_PREFETCH_DATA_PC_REL>; + +multiclass SMEM_Real_Probe_gfx13<bits<6> op> { + defvar ps = NAME; + def _IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_OPT_IMM)>; +} + +defm S_ATC_PROBE : SMEM_Real_Probe_gfx13<0x26>; +defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx13<0x27>; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b6..ce6e862 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in { } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX12Plus in { - let 
hasSideEffects = 1, Defs = [SCC] in { - def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">; + let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in { + def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr", + [(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))] + >; } } // End SubtargetPredicate = isGFX12Plus @@ -469,6 +471,25 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE], } // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE] // SchedRW = [WriteSFPU], isReMaterializable = 1 +let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in { + // Fallback patterns for f32->i16 conversion. + def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)), + (S_CVT_I32_F32 $src0)>; + def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)), + (S_CVT_U32_F32 $src0)>; + // f16 -> i32 : form chain f16 -> f32 -> i32 + def : GCNPat<(i32 (UniformUnaryFrag<fp_to_sint> f16:$src0)), + (S_CVT_I32_F32 (S_CVT_F32_F16 $src0))>; + def : GCNPat<(i32 (UniformUnaryFrag<fp_to_uint> f16:$src0)), + (S_CVT_U32_F32 (S_CVT_F32_F16 $src0))>; + + // i32 -> f16 : form chain i32 -> f32 -> f16 + def : GCNPat<(f16 (UniformUnaryFrag<sint_to_fp> i32:$src0)), + (S_CVT_F16_F32 (S_CVT_F32_I32 $src0))>; + def : GCNPat<(f16 (UniformUnaryFrag<uint_to_fp> i32:$src0)), + (S_CVT_F16_F32 (S_CVT_F32_U32 $src0))>; +} + let hasSideEffects = 1 in { let has_sdst = 0 in { let Uses = [M0] in { @@ -504,6 +525,12 @@ def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), let isConvergent = 1; } +def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + let SubtargetPredicate = HasSWakeupBarrier; +} } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -527,6 +554,12 @@ def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), let isConvergent = 1; } +def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), + (ins SplitBarrier:$src0), "$src0", 
[]>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + let SubtargetPredicate = HasSWakeupBarrier; +} } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -838,9 +871,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo < let SubtargetPredicate = isGFX6GFX7GFX8GFX9; } -let Defs = [SCC] in { -def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; -} // End Defs = [SCC] +let isCommutable = 1, Defs = [SCC] in +def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32", + [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))] +>; let SubtargetPredicate = isGFX8GFX9 in { def S_RFE_RESTORE_B64 : SOP2_Pseudo < @@ -1618,23 +1652,34 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } + +let SchedRW = [WriteBarrier], isConvergent = 1 in { + let SubtargetPredicate = isGFX12Only in def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "", [(int_amdgcn_s_barrier_leave (i16 srcvalue))] > { - let SchedRW = [WriteBarrier]; - let simm16 = 0; - let fixed_imm = 1; - let isConvergent = 1; - let Defs = [SCC]; + let simm16 = 0; + let fixed_imm = 1; + let Defs = [SCC]; + } + + let SubtargetPredicate = HasSBarrierLeaveImm in + def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", + (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; } def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { - let SubtargetPredicate = isGFX8Plus; + let SubtargetPredicate = isGFX8GFX9GFX10GFX11GFX12; let simm16 = 0; let fixed_imm = 1; let mayLoad = 1; let mayStore = 1; } +let SubtargetPredicate = HasSWakeupImm in { + def S_WAKEUP_imm : SOPP_Pseudo <"s_wakeup", + (ins i16imm:$simm16), "$simm16">; +} // End SubtargetPredicate = HasSWakeupImm + let SubtargetPredicate = isNotGFX1250Plus in { def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; @@ -1667,11 +1712,21 @@ let SubtargetPredicate = HasWaitXcnt in 
{ // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. - def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { let hasSideEffects = 0; } +let SubtargetPredicate = HasVMemToLDSLoad in { +def ASYNCMARK : SPseudoInstSI<(outs), (ins), + [(int_amdgcn_asyncmark)]> { + let maybeAtomic = 0; +} +def WAIT_ASYNCMARK : SOPP_Pseudo <"", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_wait_asyncmark timm:$simm16)]> { + let maybeAtomic = 0; +} +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { @@ -1791,8 +1846,8 @@ let SubtargetPredicate = isGFX10Plus in { let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasExportInsts] in - def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16), - "$simm16"> { + def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins WaitEvent:$simm16), + "$simm16", [(int_amdgcn_s_wait_event timm:$simm16)]> { let hasSideEffects = 1; } def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16), @@ -1915,9 +1970,7 @@ def : GCNPat< (S_SEXT_I32_I16 $src) >; -let SubtargetPredicate = isNotGFX12Plus in - def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>; -let SubtargetPredicate = isGFX12Plus in +let SubtargetPredicate = isGFX11Plus in def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 2))>; // The first 10 bits of the mode register are the core FP mode on all @@ -2091,7 +2144,34 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> { } //===----------------------------------------------------------------------===// -// SOP1 - GFX11, GFX12 +// SOP1 - GFX13 +//===----------------------------------------------------------------------===// + +multiclass SOP1_Real_gfx13<bits<8> op, string name = 
!tolower(NAME)> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx13 : SOP1_Real<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; +} + +multiclass SOP1_M0_Real_gfx13<bits<8> op> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx13 : SOP1_Real<op, ps>, Select<GFX13Gen, ps.PseudoInstr> { + let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0 + } +} + +multiclass SOP1_IMM_Real_gfx13<bits<8> op> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx13 : SOP1_Real<op, ps>, + Select<GFX13Gen, ps.PseudoInstr>; +} + +defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx13<0x011>; + +//===----------------------------------------------------------------------===// +// SOP1 - GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP1_Real_gfx11<bits<8> op, string name = !tolower(NAME)> { @@ -2110,23 +2190,29 @@ multiclass SOP1_Real_gfx12<bits<8> op, string name = !tolower(NAME)> { Select<GFX12Gen, ps.PseudoInstr>; if !ne(ps.Mnemonic, name) then def : AMDGPUMnemonicAlias<ps.Mnemonic, name> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; } } multiclass SOP1_M0_Real_gfx12<bits<8> op> { - def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, - Select<GFX12Gen, !cast<SOP1_Pseudo>(NAME).PseudoInstr> { + defvar ps = !cast<SOP1_Pseudo>(NAME); + def _gfx12 : SOP1_Real<op, ps>, Select<GFX12Gen, ps.PseudoInstr> { let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0 } } +multiclass SOP1_M0_Real_gfx12_gfx13<bits<8> op> : + SOP1_M0_Real_gfx12<op>, SOP1_M0_Real_gfx13<op>; + multiclass SOP1_IMM_Real_gfx12<bits<8> op> { defvar ps = !cast<SOP1_Pseudo>(NAME); def _gfx12 : SOP1_Real<op, ps>, Select<GFX12Gen, ps.PseudoInstr>; } +multiclass SOP1_IMM_Real_gfx12_gfx13<bits<8> op> : + SOP1_IMM_Real_gfx12<op>, SOP1_IMM_Real_gfx13<op>; + multiclass SOP1_Real_gfx11_gfx12<bits<8> op, 
string name = !tolower(NAME)> : SOP1_Real_gfx11<op, name>, SOP1_Real_gfx12<op, name>; @@ -2139,6 +2225,12 @@ multiclass SOP1_Real_gfx1250<bits<8> op, string name = !tolower(NAME)> { def : AMDGPUMnemonicAlias<ps.Mnemonic, name>; } +multiclass SOP1_Real_gfx11_gfx12_gfx13<bits<8> op> : + SOP1_Real_gfx11<op>, SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>; + +multiclass SOP1_Real_gfx12_gfx13<bits<8> op> : + SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>; + defm S_MOV_B32 : SOP1_Real_gfx11_gfx12<0x000>; defm S_MOV_B64 : SOP1_Real_gfx11_gfx12<0x001>; defm S_CMOV_B32 : SOP1_Real_gfx11_gfx12<0x002>; @@ -2207,47 +2299,49 @@ defm S_GETPC_B64 : SOP1_Real_gfx1250<0x047, "s_get_pc_i64">; defm S_SETPC_B64 : SOP1_Real_gfx1250<0x048, "s_set_pc_i64">; defm S_SWAPPC_B64 : SOP1_Real_gfx1250<0x049, "s_swap_pc_i64">; defm S_RFE_B64 : SOP1_Real_gfx1250<0x04a, "s_rfe_i64">; -defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>; -defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; -defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; -defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; -defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; -defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; -defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; -defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; -defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; -defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; -defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; -defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; -defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; -defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; - -// GFX1250 +defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12_gfx13<0x04c>; +defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12_gfx13<0x04d>; +defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12_gfx13<0x04e>; +defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12_gfx13<0x04f>; +defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12_gfx13<0x050>; +defm S_BARRIER_INIT_M0 : 
SOP1_M0_Real_gfx12_gfx13<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12_gfx13<0x052>; +defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12_gfx13<0x04e>; +defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12_gfx13<0x04f>; +defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12_gfx13<0x050>; +defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12_gfx13<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12_gfx13<0x052>; +defm S_ALLOC_VGPR : SOP1_Real_gfx12_gfx13<0x053>; +defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12_gfx13<0x058>; + +// GFX1250, GFX13 defm S_GET_SHADER_CYCLES_U64 : SOP1_Real_gfx12<0x06>; -defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; +defm S_ADD_PC_I64 : SOP1_Real_gfx12_gfx13<0x04b>; +defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12_gfx13<0x057>; +defm S_WAKEUP_BARRIER_IMM : SOP1_IMM_Real_gfx12_gfx13<0x057>; //===----------------------------------------------------------------------===// -// SOP1 - GFX1150, GFX12 +// SOP1 - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// -defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12<0x060>; -defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12<0x061>; -defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12<0x062>; -defm S_RNDNE_F32 : SOP1_Real_gfx11_gfx12<0x063>; -defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12<0x064>; -defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12<0x065>; -defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12<0x066>; -defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12<0x067>; -defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12<0x068>; -defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12<0x069>; -defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12<0x06a>; -defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12<0x06b>; -defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12<0x06c>; -defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12<0x06d>; -defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>; +defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x060>; +defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x061>; +defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x062>; +defm S_RNDNE_F32 : 
SOP1_Real_gfx11_gfx12_gfx13<0x063>; +defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12_gfx13<0x064>; +defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12_gfx13<0x065>; +defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x066>; +defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x067>; +defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12_gfx13<0x068>; +defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x069>; +defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06a>; +defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06b>; +defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06c>; +defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06d>; +defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12_gfx13<0x06e>; //===----------------------------------------------------------------------===// -// SOP1 - GFX10. +// SOP1 - GFX10, GFX13 //===----------------------------------------------------------------------===// multiclass SOP1_Real_gfx10<bits<8> op> { @@ -2256,30 +2350,33 @@ multiclass SOP1_Real_gfx10<bits<8> op> { Select<GFX10Gen, ps.PseudoInstr>; } -multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> : - SOP1_Real_gfx10<op>, SOP1_Real_gfx11_gfx12<op>; +multiclass SOP1_Real_gfx10_gfx13<bits<8> op> : + SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op>; -defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; -defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; -defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>; -defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10<0x03a>; -defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10<0x03b>; -defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03c>; -defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03d>; -defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03e>; -defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x03f>; -defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10<0x040>; -defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10<0x041>; -defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x042>; -defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10<0x043>; -defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x044>; -defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10<0x045>; -defm 
S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10<0x046>; -defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>; -defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; +multiclass SOP1_Real_gfx10_Renamed_gfx13<bits<8> op, string gfx13_name> : + SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op, gfx13_name>; + +defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x37, "s_and_not0_saveexec_b64">; +defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x38, "s_or_not0_saveexec_b64">; +defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x39, "s_and_not0_wrexec_b64">; +defm S_ANDN2_WREXEC_B64 : SOP1_Real_gfx10_Renamed_gfx13<0x3a, "s_and_not1_wrexec_b64">; +defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx10_gfx13<0x03b>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03c>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03d>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x03e>; +defm S_ANDN2_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x3f, "s_and_not1_saveexec_b32">; +defm S_ORN2_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x40, "s_or_not1_saveexec_b32">; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x041>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x042>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx10_gfx13<0x043>; +defm S_ANDN1_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x44, "s_and_not0_saveexec_b32">; +defm S_ORN1_SAVEEXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x45, "s_or_not0_saveexec_b32">; +defm S_ANDN1_WREXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x46, "s_and_not0_wrexec_b32">; +defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10_Renamed_gfx13<0x47, "s_and_not1_wrexec_b32">; +defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10_gfx13<0x049>; //===----------------------------------------------------------------------===// -// SOP1 - GFX6, GFX7, GFX10, GFX11. 
+// SOP1 - GFX6, GFX7, GFX10, GFX11, GFX13 //===----------------------------------------------------------------------===// @@ -2292,61 +2389,82 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> { multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> : SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>; -multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> : - SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11_gfx12<op>; +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx13<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op>; + +multiclass SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<bits<8> op, string gfx13_name> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx13<op, gfx13_name>; + +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>, + SOP1_Real_gfx12<op>, SOP1_Real_gfx13<op>; defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>; -defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>; -defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x004>; -defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x005>; -defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x006>; -defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x007>; -defm S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x008>; -defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x009>; -defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00a>; -defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00b>; -defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00c>; -defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x00e>; -defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x00f>; -defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x010>; +defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x003>; +defm S_MOV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x004>; +defm S_CMOV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x005>; +defm S_CMOV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x006>; +defm S_NOT_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x007>; +defm 
S_NOT_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x008>; +defm S_WQM_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x009>; +defm S_WQM_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00a>; +defm S_BREV_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00b>; +defm S_BREV_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00c>; +defm S_BCNT0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00d>; +defm S_BCNT0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00e>; +defm S_BCNT1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x00f>; +defm S_BCNT1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x010>; defm S_FF0_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x011>; defm S_FF0_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x012>; -defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x013>; -defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x014>; -defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x015>; -defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x016>; -defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x017>; -defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10<0x018>; -defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10<0x019>; -defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10<0x01a>; -defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01b>; -defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01c>; -defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x01d>; -defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01e>; -defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x01f>; -defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x020>; -defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x021>; -defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x022>; -defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x024>; -defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x025>; -defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x026>; -defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>; -defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>; -defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>; -defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_XNOR_SAVEEXEC_B64 
: SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>; -defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>; -defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>; -defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>; -defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02f>; -defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x030>; -defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>; -defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; - -//===----------------------------------------------------------------------===// -// SOP2 - GFX12 +defm S_FF1_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x013, "s_ctz_i32_b32">; +defm S_FF1_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x014, "s_ctz_i32_b64">; +defm S_FLBIT_I32_B32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x015, "s_clz_i32_u32">; +defm S_FLBIT_I32_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x016, "s_clz_i32_u64">; +defm S_FLBIT_I32 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x017, "s_cls_i32">; +defm S_FLBIT_I32_I64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x018, "s_cls_i32_i64">; +defm S_SEXT_I32_I8 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x019>; +defm S_SEXT_I32_I16 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01a>; +defm S_BITSET0_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01b>; +defm S_BITSET0_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01c>; +defm S_BITSET1_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01d>; +defm S_BITSET1_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x01e>; +defm S_GETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x01f, "s_get_pc_i64">; +defm S_SETPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x020, "s_set_pc_i64">; +defm S_SWAPPC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x021, "s_swap_pc_i64">; +defm S_RFE_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x022, "s_rfe_i64">; +defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x024>; +defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x025>; +defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x026>; +defm S_ANDN2_SAVEEXEC_B64 : 
SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x027, "s_and_not1_saveexec_b64">; +defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x028, "s_or_not1_saveexec_b64">; +defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x029>; +defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02a>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x02b>; +defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02c>; +defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02d>; +defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02e>; +defm S_MOVRELS_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x02f>; +defm S_MOVRELD_B32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x030>; +defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x031>; +defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10_gfx13<0x034>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_gfx13<bits<7> op, string name = !tolower(NAME)> { + defvar ps = !cast<SOP2_Pseudo>(NAME); + def _gfx13 : SOP2_Real32<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; +} + +defm S_PACK_HL_B32_B16 : SOP2_Real_gfx13<0x37>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> { @@ -2355,17 +2473,23 @@ multiclass SOP2_Real_gfx12<bits<7> op, string name = !tolower(NAME)> { Select<GFX12Gen, ps.PseudoInstr>; if !ne(ps.Mnemonic, name) then def : AMDGPUMnemonicAlias<ps.Mnemonic, name> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; } } -defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>; -defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>; -defm 
S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>; -defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>; +multiclass SOP2_Real_gfx12_gfx13<bits<7> op, string name = !tolower(NAME)> : + SOP2_Real_gfx12<op, name>, SOP2_Real_gfx13<op, name>; + +defm S_MINIMUM_F32 : SOP2_Real_gfx12_gfx13<0x04f>; +defm S_MAXIMUM_F32 : SOP2_Real_gfx12_gfx13<0x050>; +defm S_MINIMUM_F16 : SOP2_Real_gfx12_gfx13<0x051>; +defm S_MAXIMUM_F16 : SOP2_Real_gfx12_gfx13<0x052>; +defm S_ADD_U64 : SOP2_Real_gfx12_gfx13<0x053, "s_add_nc_u64">; +defm S_SUB_U64 : SOP2_Real_gfx12_gfx13<0x054, "s_sub_nc_u64">; +defm S_MUL_U64 : SOP2_Real_gfx12_gfx13<0x055>; //===----------------------------------------------------------------------===// -// SOP2 - GFX11, GFX12. +// SOP2 - GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx11<bits<7> op, string name = !tolower(NAME)> { @@ -2424,14 +2548,19 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx11_gfx12<0x02e>; defm S_CSELECT_B32 : SOP2_Real_gfx11_gfx12<0x030>; defm S_CSELECT_B64 : SOP2_Real_gfx11_gfx12<0x031>; defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11_gfx12<0x035>; -defm S_ADD_U64 : SOP2_Real_gfx12<0x053, "s_add_nc_u64">; -defm S_SUB_U64 : SOP2_Real_gfx12<0x054, "s_sub_nc_u64">; -defm S_MUL_U64 : SOP2_Real_gfx12<0x055>; //===----------------------------------------------------------------------===// -// SOP2 - GFX1150, GFX12 +// SOP2 - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// +multiclass SOP2_Real_gfx11_gfx12_gfx13<bits<7> op> : + SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>, SOP2_Real_gfx13<op>; + +multiclass SOP2_Real_FMAK_gfx13<bits<7> op> { + def _gfx13 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>, + Select<GFX13Gen, !cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + multiclass SOP2_Real_FMAK_gfx12<bits<7> op> { def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>, Select<GFX12Gen, !cast<SOP2_Pseudo>(NAME).PseudoInstr>; @@ -2442,35 +2571,36 @@ multiclass 
SOP2_Real_FMAK_gfx11<bits<7> op> { Select<GFX11Gen, !cast<SOP2_Pseudo>(NAME).PseudoInstr>; } -multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> : - SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>; +multiclass SOP2_Real_FMAK_gfx11_gfx12_gfx13<bits<7> op> : + SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>, SOP2_Real_FMAK_gfx13<op>; -defm S_ADD_F32 : SOP2_Real_gfx11_gfx12<0x040>; -defm S_SUB_F32 : SOP2_Real_gfx11_gfx12<0x041>; -defm S_MUL_F32 : SOP2_Real_gfx11_gfx12<0x044>; -defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x045>; -defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x046>; -defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12<0x047>; -defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12<0x048>; -defm S_ADD_F16 : SOP2_Real_gfx11_gfx12<0x049>; -defm S_SUB_F16 : SOP2_Real_gfx11_gfx12<0x04a>; -defm S_MUL_F16 : SOP2_Real_gfx11_gfx12<0x04d>; -defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12<0x04e>; +defm S_ADD_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x040>; +defm S_SUB_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x041>; +defm S_MUL_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x044>; +defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12_gfx13<0x045>; +defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12_gfx13<0x046>; +defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x047>; +defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12_gfx13<0x048>; +defm S_ADD_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x049>; +defm S_SUB_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04a>; +defm S_MUL_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04d>; +defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12_gfx13<0x04e>; //===----------------------------------------------------------------------===// -// SOP2 - GFX1150 +// SOP2 - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// -multiclass SOP2_Real_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : - SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op, gfx12_name>; +multiclass SOP2_Real_gfx11_Renamed_gfx12_gfx13<bits<7> op, string gfx12_gfx13_name> : + SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op, 
gfx12_gfx13_name>, + SOP2_Real_gfx13<op, gfx12_gfx13_name>; -defm S_MIN_F32 : SOP2_Real_gfx11_Renamed_gfx12<0x042, "s_min_num_f32">; -defm S_MAX_F32 : SOP2_Real_gfx11_Renamed_gfx12<0x043, "s_max_num_f32">; -defm S_MIN_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04b, "s_min_num_f16">; -defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12<0x04c, "s_max_num_f16">; +defm S_MIN_F32 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x042, "s_min_num_f32">; +defm S_MAX_F32 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x043, "s_max_num_f32">; +defm S_MIN_F16 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x04b, "s_min_num_f16">; +defm S_MAX_F16 : SOP2_Real_gfx11_Renamed_gfx12_gfx13<0x04c, "s_max_num_f16">; //===----------------------------------------------------------------------===// -// SOP2 - GFX10. +// SOP2 - GFX10, GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx10<bits<7> op> { @@ -2479,21 +2609,25 @@ multiclass SOP2_Real_gfx10<bits<7> op> { Select<GFX10Gen, ps.PseudoInstr>; } -multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> : - SOP2_Real_gfx10<op>, SOP2_Real_gfx11_gfx12<op>; +multiclass SOP2_Real_gfx10_gfx13<bits<7> op> : + SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op>; -defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; -defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>; -defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>; -defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>; -defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x032>; -defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x033>; -defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x034>; -defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>; -defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; +multiclass SOP2_Real_gfx10_gfx11_gfx12_gfx13<bits<7> op> : + SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>, + SOP2_Real_gfx13<op>; + +defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10_gfx13<0x02e>; +defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10_gfx13<0x02f>; +defm S_LSHL3_ADD_U32 : 
SOP2_Real_gfx10_gfx13<0x030>; +defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10_gfx13<0x031>; +defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x032>; +defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x033>; +defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12_gfx13<0x034>; +defm S_MUL_HI_U32 : SOP2_Real_gfx10_gfx13<0x035>; +defm S_MUL_HI_I32 : SOP2_Real_gfx10_gfx13<0x036>; //===----------------------------------------------------------------------===// -// SOP2 - GFX6, GFX7. +// SOP2 - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { @@ -2502,57 +2636,105 @@ multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { Select_gfx6_gfx7<ps.PseudoInstr>; } -multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> : - SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>; +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx13<bits<7> op> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op>; + +multiclass SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<bits<7> op, string gfx13_name> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx13<op, gfx13_name>; -multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<bits<7> op, string gfx12_gfx13_name> : SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>, - SOP2_Real_gfx12<op, gfx12_name>; + SOP2_Real_gfx12<op, gfx12_gfx13_name>, SOP2_Real_gfx13<op, gfx12_gfx13_name>; defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>; -defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x000, "s_add_co_u32">; -defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x001, "s_sub_co_u32">; -defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x002, "s_add_co_i32">; -defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x003, "s_sub_co_i32">; -defm S_ADDC_U32 : 
SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x004, "s_add_co_ci_u32">; -defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x005, "s_sub_co_ci_u32">; -defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>; -defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>; -defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>; -defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x009>; -defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00a>; -defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00b>; -defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x00e>; -defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x00f>; -defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x010>; -defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x011>; -defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x012>; -defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x013>; -defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x014>; -defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x015>; -defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x016>; -defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x017>; -defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x018>; -defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x019>; -defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01a>; -defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01b>; -defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01c>; -defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01d>; -defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x01e>; -defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x01f>; -defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x020>; -defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x021>; -defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x022>; -defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x023>; -defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10<0x024>; -defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10<0x025>; -defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x026>; -defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x027>; -defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x028>; -defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10<0x029>; -defm S_BFE_I64 : 
SOP2_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; +defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x000, "s_add_co_u32">; +defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x001, "s_sub_co_u32">; +defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x002, "s_add_co_i32">; +defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x003, "s_sub_co_i32">; +defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x004, "s_add_co_ci_u32">; +defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x005, "s_sub_co_ci_u32">; +defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x006>; +defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x007>; +defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x008>; +defm S_MAX_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x009>; +defm S_CSELECT_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00a>; +defm S_CSELECT_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00b>; +defm S_AND_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00e>; +defm S_AND_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x00f>; +defm S_OR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x010>; +defm S_OR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x011>; +defm S_XOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x012>; +defm S_XOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x013>; +defm S_ANDN2_B32 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x014, "s_and_not1_b32">; +defm S_ANDN2_B64 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x015, "s_and_not1_b64">; +defm S_ORN2_B32 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x016, "s_or_not1_b32">; +defm S_ORN2_B64 : SOP2_Real_gfx6_gfx7_gfx10_Renamed_gfx13<0x017, "s_or_not1_b64">; +defm S_NAND_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x018>; +defm S_NAND_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x019>; +defm S_NOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01a>; +defm S_NOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01b>; +defm S_XNOR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01c>; 
+defm S_XNOR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01d>; +defm S_LSHL_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01e>; +defm S_LSHL_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x01f>; +defm S_LSHR_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x020>; +defm S_LSHR_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x021>; +defm S_ASHR_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x022>; +defm S_ASHR_I64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x023>; +defm S_BFM_B32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x024>; +defm S_BFM_B64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x025>; +defm S_MUL_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x026>; +defm S_BFE_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x027>; +defm S_BFE_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x028>; +defm S_BFE_U64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x029>; +defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x02a>; +defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx13<0x02c>; + + +//===----------------------------------------------------------------------===// +// SOPK - GFX10 Only +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx10<bits<5> op> { + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx10 : SOPK_Real32<op, ps>, + Select<GFX10Gen, ps.PseudoInstr>; +} + +multiclass SOPK_Real64_gfx10<bits<5> op> { + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx10 : SOPK_Real64<op, ps>, + Select<GFX10Gen, ps.PseudoInstr>; +} + +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>; +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX11 Only +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx11<bits<5> op> { + def _gfx11 : SOPK_Real32<op, 
!cast<SOPK_Pseudo>(NAME)>, + Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; +} + +multiclass SOPK_Real64_gfx11<bits<5> op> { + def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, + Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; +} + +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; //===----------------------------------------------------------------------===// // SOPK - GFX11, GFX12. @@ -2568,21 +2750,11 @@ multiclass SOPK_Real32_gfx12<bits<5> op, string name = !tolower(NAME)> { } } -multiclass SOPK_Real32_gfx11<bits<5> op> { - def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, - Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; -} - multiclass SOPK_Real64_gfx12<bits<5> op> { def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, Select<GFX12Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; } -multiclass SOPK_Real64_gfx11<bits<5> op> { - def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, - Select<GFX11Gen, !cast<SOPK_Pseudo>(NAME).PseudoInstr>; -} - multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> : SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>; @@ -2604,43 +2776,39 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>; let OtherPredicates = [isNotGFX1250Plus] in defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>; defm S_CALL_B64 : SOPK_Real32_gfx1250<0x014, "s_call_i64">; -defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; -defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; -defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; -defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; -defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; -defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; //===----------------------------------------------------------------------===// -// SOPK 
- GFX10. +// SOPK - GFX10, GFX11, GFX12, GFX13. //===----------------------------------------------------------------------===// -multiclass SOPK_Real32_gfx10<bits<5> op> { +multiclass SOPK_Real32_gfx13<bits<5> op, string name = !tolower(NAME)> { defvar ps = !cast<SOPK_Pseudo>(NAME); - def _gfx10 : SOPK_Real32<op, ps>, - Select<GFX10Gen, ps.PseudoInstr>; + def _gfx13 : SOPK_Real32<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>; + if !ne(ps.Mnemonic, name) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; } -multiclass SOPK_Real64_gfx10<bits<5> op> { +multiclass SOPK_Real64_gfx13<bits<5> op> { defvar ps = !cast<SOPK_Pseudo>(NAME); - def _gfx10 : SOPK_Real64<op, ps>, - Select<GFX10Gen, ps.PseudoInstr>; + def _gfx13 : SOPK_Real64<op, ps>, + Select<GFX13Gen, ps.Mnemonic>; } -multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> : - SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>; +multiclass SOPK_Real32_gfx10_gfx11_gfx12_gfx13<bits<5> op> : + SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>, + SOPK_Real32_gfx13<op>; -multiclass SOPK_Real32_gfx10_gfx11_gfx12<bits<5> op> : - SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11_gfx12<op>; +defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12_gfx13<0x001>; -defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12<0x001>; -defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>; -defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; -defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; -defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx10<0x019>; -defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx10<0x01a>; -defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx10<0x01b>; -defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx10<0x01c>; +//===----------------------------------------------------------------------===// +// SOPK - GFX10, GFX13. 
+//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx10_Renamed_gfx13<bits<5> op, string gfx13_name> : + SOPK_Real32_gfx10<op>, SOPK_Real32_gfx13<op, gfx13_name>; + +defm S_CALL_B64 : SOPK_Real32_gfx10_Renamed_gfx13<0x016, "s_call_i64">; //===----------------------------------------------------------------------===// // SOPK - GFX6, GFX7. @@ -2652,32 +2820,15 @@ multiclass SOPK_Real32_gfx6_gfx7<bits<5> op> { Select_gfx6_gfx7<ps.PseudoInstr>; } -multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> { - defvar ps = !cast<SOPK_Pseudo>(NAME); - def _gfx6_gfx7 : SOPK_Real64<op, ps>, - Select_gfx6_gfx7<ps.PseudoInstr>; -} - -multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>; +defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; -multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> : - SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>; +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7, GFX10, GFX11. 
+//===----------------------------------------------------------------------===// multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>; + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>; -multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<bits<5> op> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11_gfx12<op>; - -multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<bits<5> op, string gfx12_name> : - SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, - SOPK_Real32_gfx12<op, gfx12_name>; - -defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; - -defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>; -defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>; defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>; defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>; defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>; @@ -2690,11 +2841,71 @@ defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00b>; defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>; defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>; defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>; -defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12<0x00f, "s_addk_co_i32">; -defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x010>; -defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>; -defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>; -defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7, GFX10, GFX13. 
+//===----------------------------------------------------------------------===// + +multiclass SOPK_Real64_gfx6_gfx7<bits<5> op> { + defvar ps = !cast<SOPK_Pseudo>(NAME); + def _gfx6_gfx7 : SOPK_Real64<op, ps>, + Select_gfx6_gfx7<ps.PseudoInstr>; +} + +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx13<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx13<op>; + +multiclass SOPK_Real64_gfx6_gfx7_gfx10_gfx13<bits<5> op> : + SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>, SOPK_Real64_gfx13<op>; + +defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx13<0x012>; +defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx13<0x013>; +defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10_gfx13<0x015>; + +//===----------------------------------------------------------------------===// +// SOPK - GFX6, GFX7, GFX10, GFX11, GFX12, GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, + SOPK_Real32_gfx12<op>, SOPK_Real32_gfx13<op>; + +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<bits<5> op, string gfx12_gfx13_name> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>, + SOPK_Real32_gfx12<op, gfx12_gfx13_name>, SOPK_Real32_gfx13<op, gfx12_gfx13_name>; + +defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x000>; +defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x002>; +defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_Renamed_gfx12_gfx13<0x00f, "s_addk_co_i32">; +defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x010>; + + +//===----------------------------------------------------------------------===// +// SOPP - GFX13 only +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx13<bits<7> op, string name = 
!cast<SOPP_Pseudo>(NAME).Mnemonic, bit compat_alias = 1> { + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx13 : SOPP_Real_32<op, ps, name>, + Select<GFX13Gen, ps.Mnemonic>, + SOPPRelaxTable<0, ps.KeyName, "_gfx13">; + if !and(compat_alias, !ne(ps.Mnemonic, name)) then + def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX13Only]>; +} + +multiclass SOPP_Real_64_gfx13<bits<7> op> { + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx13 : SOPP_Real_64<op, ps, ps.Mnemonic>, + Select<GFX13Gen, ps.Mnemonic>, + SOPPRelaxTable<1, ps.KeyName, "_gfx13">; +} + +defm S_WAKEUP_imm : SOPP_Real_32_gfx13<0x003>; +defm S_BARRIER_WAIT : SOPP_Real_32_gfx13<0x2b>; +defm S_MONITOR_SLEEP : SOPP_Real_32_gfx13<0x2c>; +defm S_DELAY_ALU : SOPP_Real_32_gfx13<0x2e>; +defm S_WAIT_EVENT : SOPP_Real_32_gfx13<0x2f>; +defm S_BARRIER_LEAVE_IMM : SOPP_Real_32_gfx13<0x31>; //===----------------------------------------------------------------------===// // SOPP - GFX12 only. @@ -2706,35 +2917,23 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> { Select<GFX12Gen, ps.PseudoInstr>; if !ne(ps.Mnemonic, name) then def : AMDGPUMnemonicAlias<ps.Mnemonic, name> { - let AssemblerPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Only; } } +multiclass SOPP_Real_64_gfx12<bits<7> op> { + def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, + Select<GFX12Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>, + SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">; +} + defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; -defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; -defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; -defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; defm S_WAIT_BVHCNT : SOPP_Real_32_gfx12<0x043>; -defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12<0x044>; -defm S_WAIT_DSCNT : SOPP_Real_32_gfx12<0x046>; -defm S_WAIT_KMCNT : SOPP_Real_32_gfx12<0x047>; -defm S_WAIT_LOADCNT_DSCNT : 
SOPP_Real_32_gfx12<0x048>; -defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>; //===----------------------------------------------------------------------===// -// SOPP - GFX1250 only. +// SOPP - GFX11 only. //===----------------------------------------------------------------------===// -defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>; -defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>; -defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>; -defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>; -defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12<0x04b>; - -//===----------------------------------------------------------------------===// -// SOPP - GFX11, GFX12. -//===----------------------------------------------------------------------===// - multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> { defvar ps = !cast<SOPP_Pseudo>(NAME); @@ -2747,94 +2946,91 @@ multiclass SOPP_Real_32_gfx11<bits<7> op, string name = !tolower(NAME)> { } } -multiclass SOPP_Real_64_gfx12<bits<7> op> { - def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, - Select<GFX12Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>, - SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">; -} - multiclass SOPP_Real_64_gfx11<bits<7> op> { def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, Select<GFX11Gen, !cast<SOPP_Pseudo>(NAME).PseudoInstr>, SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">; } -multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> : - SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>; - -multiclass SOPP_Real_32_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : - SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op, gfx12_name>; - -multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> { - defm "" : SOPP_Real_32_gfx12<op>; - let isCodeGenOnly = 1 in - defm _pad_s_nop : SOPP_Real_64_gfx12<op>; -} - multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> { defm "" : SOPP_Real_32_gfx11<op>; let 
isCodeGenOnly = 1 in defm _pad_s_nop : SOPP_Real_64_gfx11<op>; } -multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> : - SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>; - -defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; -defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; -defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; defm S_INST_PREFETCH : SOPP_Real_32_gfx11<0x004, "s_set_inst_prefetch_distance">; -defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; -defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; -defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11_Renamed_gfx12<0x008, "s_wait_alu">; -defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>; -defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>; -defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>; -defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>; -defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>; -defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>; -defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>; -defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>; -defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>; -defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>; -defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>; -defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>; -defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>; defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>; defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>; defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>; defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>; -defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>; -defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>; defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx11<0x032>; -defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>; -defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>; -defm S_SENDMSG : 
SOPP_Real_32_gfx11_gfx12<0x036>; -defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>; -defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>; -defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>; -defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>; -defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>; -defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; - defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; //===----------------------------------------------------------------------===// -// SOPP - GFX1250. +// SOPP - GFX10 only. //===----------------------------------------------------------------------===// -defm S_MONITOR_SLEEP : SOPP_Real_32_gfx12<0x004>; +multiclass SOPP_Real_32_gfx10<bits<7> op> { + defvar ps = !cast<SOPP_Pseudo>(NAME); + def _gfx10 : SOPP_Real_32<op, ps>, + Select<GFX10Gen, ps.PseudoInstr>, + SOPPRelaxTable<0, ps.KeyName, "_gfx10">; +} + +defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>; //===----------------------------------------------------------------------===// -// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 +// SOPP - GFX12, GFX13. 
//===----------------------------------------------------------------------===// -multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> { - defvar ps = !cast<SOPP_Pseudo>(NAME); - def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>, - Select_gfx6_gfx7<ps.PseudoInstr>, - SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">; -} +multiclass SOPP_Real_32_gfx12_gfx13<bits<7> op> : + SOPP_Real_32_gfx12<op>, SOPP_Real_32_gfx13<op>; + +defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12_gfx13<0x040>; +defm S_WAIT_STORECNT : SOPP_Real_32_gfx12_gfx13<0x041>; +defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12_gfx13<0x042>; +defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12_gfx13<0x044>; +defm S_WAIT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x046>; +defm S_WAIT_KMCNT : SOPP_Real_32_gfx12_gfx13<0x047>; +defm S_WAIT_LOADCNT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x048>; +defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12_gfx13<0x049>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX1250 only. 
+//===----------------------------------------------------------------------===// + +defm S_MONITOR_SLEEP : SOPP_Real_32_gfx12<0x004>; +defm S_SET_VGPR_MSB : SOPP_Real_32_gfx12<0x006>; +defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX1250, GFX13 +//===----------------------------------------------------------------------===// + +defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12_gfx13<0x03e>; +defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12_gfx13<0x04a>; +defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12_gfx13<0x04b>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX10, GFX13 +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx10_gfx13<bits<7> op> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx13<op>; + +multiclass SOPP_Real_32_gfx10_Renamed_gfx13<bits<7> op, string gfx13_name> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx13<op, gfx13_name>; + +defm S_CLAUSE : SOPP_Real_32_gfx10_gfx13<0x021>; +defm S_WAIT_IDLE : SOPP_Real_32_gfx10_gfx13<0x022>; +defm S_ROUND_MODE : SOPP_Real_32_gfx10_gfx13<0x024>; +defm S_DENORM_MODE : SOPP_Real_32_gfx10_gfx13<0x025>; +defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10_gfx13<0x028>; +defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10_Renamed_gfx13<0x023, "s_wait_alu">; + +//===----------------------------------------------------------------------===// +// SOPP - GFX8, GFX9. 
+//===----------------------------------------------------------------------===// multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> { defvar ps = !cast<SOPP_Pseudo>(NAME); @@ -2843,27 +3039,46 @@ multiclass SOPP_Real_32_gfx8_gfx9<bits<7> op> { SOPPRelaxTable<0, ps.KeyName, "_vi">; } -multiclass SOPP_Real_32_gfx10<bits<7> op> { +defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>; +defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx6_gfx7<bits<7> op> { defvar ps = !cast<SOPP_Pseudo>(NAME); - def _gfx10 : SOPP_Real_32<op, ps>, - Select<GFX10Gen, ps.PseudoInstr>, - SOPPRelaxTable<0, ps.KeyName, "_gfx10">; + def _gfx6_gfx7 : SOPP_Real_32<op, ps, !cast<SOPP_Pseudo>(NAME).Mnemonic>, + Select_gfx6_gfx7<ps.PseudoInstr>, + SOPPRelaxTable<0, ps.KeyName, "_gfx6_gfx7">; } -multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op> : - SOPP_Real_32_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; - multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op> : SOPP_Real_32_gfx6_gfx7<op>, SOPP_Real_32_gfx8_gfx9<op>; multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; -multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> : - SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>; +defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; +defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, GFX12, GFX13. 
+//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<bits<7> op> : + SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11<op>, + SOPP_Real_32_gfx12<op>, SOPP_Real_32_gfx13<op>; + +defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x000>; -multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> : - SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>; +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10, GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> : + SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>, + SOPP_Real_32_gfx13<op>; //64 bit encodings, for Relaxation multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> { @@ -2890,6 +3105,44 @@ multiclass SOPP_Real_64_gfx10<bits<7> op> { multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op> : SOPP_Real_64_gfx6_gfx7<op>, SOPP_Real_64_gfx8_gfx9<op>; +multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> : + SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>, + SOPP_Real_64_gfx13<op>; + +multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<bits<7> op> { + defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<op>; + let isCodeGenOnly = 1 in + defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<op>; +} + +defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x001>; +defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00d>; +defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00b>; +defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00e>; +defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x00f>; +defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x010>; +defm S_SENDMSGHALT : 
SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x011>; +defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x012>; +defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x013>; +defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x014>; +defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x015>; +defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x016>; +defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x01B>; + +let isBranch = 1 in { +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x002>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x004>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x005>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x006>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x007>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x008>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10_gfx13<0x009>; +} + +//===----------------------------------------------------------------------===// +// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10. 
+//===----------------------------------------------------------------------===// + multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>; @@ -2900,43 +3153,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> { defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>; } -defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x000>; -defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001>; -defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; -defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; -defm S_WAITCNT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00c>; -defm S_SETHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00d>; -defm S_SETKILL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00b>; -defm S_SLEEP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00e>; -defm S_SETPRIO : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00f>; -defm S_SENDMSG : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x010>; -defm S_SENDMSGHALT : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x011>; -defm S_TRAP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x012>; -defm S_ICACHE_INV : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x013>; -defm S_INCPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x014>; -defm S_DECPERFLEVEL : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x015>; -defm S_TTRACEDATA : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x016>; -defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>; -defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>; -defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>; -defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>; -defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12<0x01f>; -defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>; -defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>; -defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>; -defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx10<0x023>; -defm S_ROUND_MODE : SOPP_Real_32_gfx10<0x024>; -defm S_DENORM_MODE 
: SOPP_Real_32_gfx10<0x025>; -defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx10<0x028>; - let isBranch = 1 in { -defm S_BRANCH : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x002>; -defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x004>; -defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x005>; -defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x006>; -defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x007>; -defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x008>; -defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x009>; defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x017>; defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x018>; defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<0x019>; @@ -2944,6 +3161,77 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_ } //===----------------------------------------------------------------------===// +// SOPP - GFX10, GFX11, GFX12, GFX13. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx10_gfx11_gfx12_gfx13<bits<7> op> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>, + SOPP_Real_32_gfx13<op>; + +defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12_gfx13<0x01f>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX11, GFX12. 
+//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> : + SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>; + +multiclass SOPP_Real_32_gfx11_Renamed_gfx12<bits<7> op, string gfx12_name> : + SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op, gfx12_name>; + +multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> { + defm "" : SOPP_Real_32_gfx12<op>; + let isCodeGenOnly = 1 in + defm _pad_s_nop : SOPP_Real_64_gfx12<op>; +} + +multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> : + SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>; + +defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; +defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; +defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; +defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; +defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; +defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11_Renamed_gfx12<0x008, "s_wait_alu">; +defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>; +defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>; +defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>; +defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>; +defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>; +defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>; +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>; +defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>; +defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>; +defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>; +defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>; +defm S_SENDMSG : 
SOPP_Real_32_gfx11_gfx12<0x036>; +defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>; +defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>; +defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>; +defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>; +defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>; +defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX8, GFX9, GFX10. +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx8_gfx9_gfx10<bits<7> op> : + SOPP_Real_32_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; + +defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; +defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>; + + +//===----------------------------------------------------------------------===// // SOPC - GFX11, GFX12. //===----------------------------------------------------------------------===// @@ -2964,41 +3252,61 @@ defm S_CMP_EQ_U64 : SOPC_Real_gfx11_gfx12<0x10>; defm S_CMP_LG_U64 : SOPC_Real_gfx11_gfx12<0x11>; //===----------------------------------------------------------------------===// -// SOPC - GFX1150, GFX12 +// SOPC - GFX1150, GFX12, GFX13 //===----------------------------------------------------------------------===// -defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12<0x41>; -defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12<0x42>; -defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12<0x43>; -defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12<0x44>; -defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12<0x45>; -defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12<0x46>; -defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12<0x47>; -defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12<0x48>; -defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12<0x49>; -defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12<0x4a>; -defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12<0x4b>; -defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12<0x4c>; -defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12<0x4d>; -defm 
S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12<0x4e>; +multiclass SOPC_Real_gfx13<bits<7> op> { + def _gfx13 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, + Select<GFX13Gen, !cast<SOPC_Pseudo>(NAME).Mnemonic>; +} -defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12<0x51>; -defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12<0x52>; -defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12<0x53>; -defm S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12<0x54>; -defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12<0x55>; -defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12<0x56>; -defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12<0x57>; -defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12<0x58>; -defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12<0x59>; -defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12<0x5a>; -defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12<0x5b>; -defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12<0x5c>; -defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12<0x5d>; -defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>; +multiclass SOPC_Real_gfx11_gfx12_gfx13<bits<7> op> : + SOPC_Real_gfx11<op>, SOPC_Real_gfx12<op>, SOPC_Real_gfx13<op>; + +defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x41>; +defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x42>; +defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x43>; +defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x44>; +defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x45>; +defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x46>; +defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x47>; +defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x48>; +defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x49>; +defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4a>; +defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4b>; +defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4c>; +defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4d>; +defm S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12_gfx13<0x4e>; + +defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x51>; +defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x52>; +defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x53>; +defm 
S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x54>; +defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x55>; +defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x56>; +defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x57>; +defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x58>; +defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x59>; +defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5a>; +defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5b>; +defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5c>; +defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5d>; +defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12_gfx13<0x5e>; //===----------------------------------------------------------------------===// -// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10 +// SOPC - GFX8, GFX9. +//===----------------------------------------------------------------------===// + +multiclass SOPC_Real_gfx8_gfx9<bits<7> op> { + defvar ps = !cast<SOPC_Pseudo>(NAME); + def _vi : SOPC_Real<op, ps>, + Select_vi<ps.PseudoInstr>; +} + +defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX6, GFX7, GFX8, GFX9. 
//===----------------------------------------------------------------------===// multiclass SOPC_Real_gfx6_gfx7<bits<7> op> { @@ -3007,11 +3315,14 @@ multiclass SOPC_Real_gfx6_gfx7<bits<7> op> { Select_gfx6_gfx7<ps.PseudoInstr>; } -multiclass SOPC_Real_gfx8_gfx9<bits<7> op> { - defvar ps = !cast<SOPC_Pseudo>(NAME); - def _vi : SOPC_Real<op, ps>, - Select_vi<ps.PseudoInstr>; -} +multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> : + SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>; + +defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10, GFX13 +//===----------------------------------------------------------------------===// multiclass SOPC_Real_gfx10<bits<7> op> { defvar ps = !cast<SOPC_Pseudo>(NAME); @@ -3019,36 +3330,36 @@ multiclass SOPC_Real_gfx10<bits<7> op> { Select<GFX10Gen, ps.PseudoInstr>; } -multiclass SOPC_Real_gfx8_gfx9_gfx10<bits<7> op> : - SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>; +multiclass SOPC_Real_gfx8_gfx9_gfx10_gfx13<bits<7> op> : + SOPC_Real_gfx8_gfx9<op>, SOPC_Real_gfx10<op>, SOPC_Real_gfx13<op>; -multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9<bits<7> op> : - SOPC_Real_gfx6_gfx7<op>, SOPC_Real_gfx8_gfx9<op>; +defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10_gfx13<0x12>; +defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10_gfx13<0x13>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, GFX12, GFX13 +//===----------------------------------------------------------------------===// -multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> : +multiclass SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<bits<7> op> : SOPC_Real_gfx6_gfx7_gfx8_gfx9<op>, SOPC_Real_gfx10<op>, SOPC_Real_gfx11<op>, - SOPC_Real_gfx12<op>; - -defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x00>; -defm S_CMP_LG_I32 : 
SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x01>; -defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x02>; -defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x03>; -defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x04>; -defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x05>; -defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x06>; -defm S_CMP_LG_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x07>; -defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x08>; -defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x09>; -defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0a>; -defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0b>; -defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0c>; -defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0d>; -defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0e>; -defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x0f>; -defm S_SETVSKIP : SOPC_Real_gfx6_gfx7_gfx8_gfx9<0x10>; -defm S_SET_GPR_IDX_ON : SOPC_Real_gfx8_gfx9<0x11>; -defm S_CMP_EQ_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x12>; -defm S_CMP_LG_U64 : SOPC_Real_gfx8_gfx9_gfx10<0x13>; + SOPC_Real_gfx12<op>, SOPC_Real_gfx13<op>; + +defm S_CMP_EQ_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x00>; +defm S_CMP_LG_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x01>; +defm S_CMP_GT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x02>; +defm S_CMP_GE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x03>; +defm S_CMP_LT_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x04>; +defm S_CMP_LE_I32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x05>; +defm S_CMP_EQ_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x06>; +defm S_CMP_LG_U32 : 
SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x07>; +defm S_CMP_GT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x08>; +defm S_CMP_GE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x09>; +defm S_CMP_LT_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0a>; +defm S_CMP_LE_U32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0b>; +defm S_BITCMP0_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0c>; +defm S_BITCMP1_B32 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0d>; +defm S_BITCMP0_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0e>; +defm S_BITCMP1_B64 : SOPC_Real_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12_gfx13<0x0f>; //===----------------------------------------------------------------------===// // GFX8 (VI), GFX9. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 6489e63..fddd9c7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -99,7 +99,6 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10}, {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus}, {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10}, - {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, @@ -111,7 +110,8 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus}, {{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus}, {{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE, - isGFX1250}, + isGFX1250Plus}, + {{"MSG_RTN_SAVE_WAVE_HAS_TDM"}, ID_RTN_SAVE_WAVE_HAS_TDM, isGFX1250Plus} }; static constexpr CustomOperand SysMsgOperands[] = { @@ -156,6 +156,26 @@ StringRef getMsgOpName(int64_t MsgId, 
uint64_t Encoding, } // namespace SendMsg +namespace WaitEvent { + +// clang-format off +static constexpr CustomOperand WaitEventOperands[] = { + {{"{ export_ready: 0 }"}, 0, isGFX12Plus}, + {{"{ dont_wait_export_ready: 0 }"}, 0, isGFX11}, + {{"{ dont_wait_export_ready: 1 }"}, DONT_WAIT_EXPORT_READY, isGFX11}, + {{"{ export_ready: 1 }"}, EXPORT_READY, isGFX12Plus} +}; +// clang-format on + +int64_t getWaitEventMask(StringRef Name, const MCSubtargetInfo &STI) { + return getEncodingFromOperandTable(WaitEventOperands, Name, STI); +} + +StringRef getWaitEventMaskName(uint64_t Encoding, const MCSubtargetInfo &STI) { + return getNameFromOperandTable(WaitEventOperands, Encoding, STI); +} +} // namespace WaitEvent + namespace Hwreg { // Disable lint checking for this block since it makes the table unreadable. @@ -211,8 +231,9 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250Plus}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, @@ -220,8 +241,8 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, {{"HW_REG_WAVE_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, {{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, - {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250}, - {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250}, + {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250Plus}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250Plus}, }; // 
clang-format on diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index c84c1a7..5916e27 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -84,6 +84,11 @@ StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding, } // namespace SendMsg +namespace WaitEvent { +int64_t getWaitEventMask(StringRef Name, const MCSubtargetInfo &STI); +StringRef getWaitEventMaskName(uint64_t Encoding, const MCSubtargetInfo &STI); +} // namespace WaitEvent + namespace Hwreg { // Symbolic names for the hwreg(...) syntax. int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI); @@ -127,6 +132,20 @@ ArrayRef<GFXVersion> getGFXVersions(); } // namespace UCVersion +namespace WMMAMods { +// These should match enum values in SIDefines.h + +constexpr const char *const ModMatrixFmt[] = { + "MATRIX_FMT_FP8", "MATRIX_FMT_BF8", "MATRIX_FMT_FP6", "MATRIX_FMT_BF6", + "MATRIX_FMT_FP4"}; + +constexpr const char *const ModMatrixScale[] = {"MATRIX_SCALE_ROW0", + "MATRIX_SCALE_ROW1"}; + +constexpr const char *const ModMatrixScaleFmt[] = { + "MATRIX_SCALE_FMT_E8", "MATRIX_SCALE_FMT_E5M3", "MATRIX_SCALE_FMT_E4M3"}; +} // namespace WMMAMods + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3e1b058..3f32d11 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -177,7 +177,13 @@ inline unsigned getVaSsrcBitWidth() { return 1; } inline unsigned getVaSsrcBitShift() { return 8; } /// \returns HoldCnt bit shift -inline unsigned getHoldCntWidth() { return 1; } +inline unsigned getHoldCntWidth(unsigned VersionMajor, unsigned VersionMinor) { + static constexpr const unsigned MinMajor = 10; + static constexpr const unsigned MinMinor = 3; + return std::tie(VersionMajor, VersionMinor) >= std::tie(MinMajor, MinMinor) + ? 
1 + : 0; +} /// \returns HoldCnt bit shift inline unsigned getHoldCntBitShift() { return 7; } @@ -188,6 +194,10 @@ namespace llvm { namespace AMDGPU { +iota_range<InstCounterType> inst_counter_types(InstCounterType MaxCounter) { + return enum_seq(LOAD_CNT, MaxCounter); +} + /// \returns true if the target supports signed immediate offset for SMRD /// instructions. bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { @@ -349,8 +359,8 @@ unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, } struct MUBUFInfo { - uint16_t Opcode; - uint16_t BaseOpcode; + uint32_t Opcode; + uint32_t BaseOpcode; uint8_t elements; bool has_vaddr; bool has_srsrc; @@ -360,8 +370,8 @@ struct MUBUFInfo { }; struct MTBUFInfo { - uint16_t Opcode; - uint16_t BaseOpcode; + uint32_t Opcode; + uint32_t BaseOpcode; uint8_t elements; bool has_vaddr; bool has_srsrc; @@ -369,25 +379,25 @@ struct MTBUFInfo { }; struct SMInfo { - uint16_t Opcode; + uint32_t Opcode; bool IsBuffer; }; struct VOPInfo { - uint16_t Opcode; + uint32_t Opcode; bool IsSingle; }; struct VOPC64DPPInfo { - uint16_t Opcode; + uint32_t Opcode; }; struct VOPCDPPAsmOnlyInfo { - uint16_t Opcode; + uint32_t Opcode; }; struct VOP3CDPPAsmOnlyInfo { - uint16_t Opcode; + uint32_t Opcode; }; struct VOPDComponentInfo { @@ -398,7 +408,7 @@ struct VOPDComponentInfo { }; struct VOPDInfo { - uint16_t Opcode; + uint32_t Opcode; uint16_t OpX; uint16_t OpY; uint16_t Subtarget; @@ -406,7 +416,7 @@ struct VOPDInfo { }; struct VOPTrue16Info { - uint16_t Opcode; + uint32_t Opcode; bool IsTrue16; }; @@ -414,16 +424,18 @@ struct VOPTrue16Info { #define GET_FP4FP8DstByteSelTable_IMPL struct DPMACCInstructionInfo { - uint16_t Opcode; + uint32_t Opcode; bool IsDPMACCInstruction; }; struct FP4FP8DstByteSelInfo { - uint16_t Opcode; + uint32_t Opcode; bool HasFP8DstByteSel; bool HasFP4DstByteSel; }; +#define GET_DPMACCInstructionTable_DECL +#define GET_DPMACCInstructionTable_IMPL #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL 
#define GET_MUBUFInfoTable_DECL @@ -729,6 +741,8 @@ bool isGenericAtomic(unsigned Opc) { Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX || Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 || Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG; } @@ -780,6 +794,11 @@ FPType getFPDstSelType(unsigned Opc) { return FPType::None; } +bool isDPMACCInstruction(unsigned Opc) { + const DPMACCInstructionInfo *Info = getDPMACCInstructionHelper(Opc); + return Info && Info->IsDPMACCInstruction; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? Info->Opcode3Addr : ~0u; @@ -793,7 +812,7 @@ unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) { // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. 
-int getMCOpcode(uint16_t Opcode, unsigned Gen) { +int64_t getMCOpcode(uint32_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } @@ -897,7 +916,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, bool VOPD3) const { @@ -914,12 +933,13 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( BaseX = X; if (!BaseY) BaseY = Y; - if ((BaseX & BanksMask) == (BaseY & BanksMask)) + if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask)) return true; if (BaseX != X /* This is 64-bit register */ && - ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask)) return true; - if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + if (BaseY != Y && + (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask)) return true; // If both are 64-bit bank conflict will be detected yet while checking @@ -968,7 +988,7 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( // if the operand is not a register or not a VGPR. InstInfo::RegIndices InstInfo::getRegIndices(unsigned CompIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); @@ -983,7 +1003,7 @@ InstInfo::getRegIndices(unsigned CompIdx, Comp.hasRegSrcOperand(CompSrcIdx) ? 
GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) - : 0; + : MCRegister(); } return RegIndices; } @@ -1709,6 +1729,30 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) { return false; } +raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait) { + ListSeparator LS; + if (Wait.LoadCnt != ~0u) + OS << LS << "LoadCnt: " << Wait.LoadCnt; + if (Wait.ExpCnt != ~0u) + OS << LS << "ExpCnt: " << Wait.ExpCnt; + if (Wait.DsCnt != ~0u) + OS << LS << "DsCnt: " << Wait.DsCnt; + if (Wait.StoreCnt != ~0u) + OS << LS << "StoreCnt: " << Wait.StoreCnt; + if (Wait.SampleCnt != ~0u) + OS << LS << "SampleCnt: " << Wait.SampleCnt; + if (Wait.BvhCnt != ~0u) + OS << LS << "BvhCnt: " << Wait.BvhCnt; + if (Wait.KmCnt != ~0u) + OS << LS << "KmCnt: " << Wait.KmCnt; + if (Wait.XCnt != ~0u) + OS << LS << "XCnt: " << Wait.XCnt; + if (LS.unused()) + OS << "none"; + OS << '\n'; + return OS; +} + unsigned getVmcntBitMask(const IsaVersion &Version) { return (1 << (getVmcntBitWidthLo(Version.Major) + getVmcntBitWidthHi(Version.Major))) - @@ -1751,6 +1795,25 @@ unsigned getStorecntBitMask(const IsaVersion &Version) { return (1 << getStorecntBitWidth(Version.Major)) - 1; } +HardwareLimits::HardwareLimits(const IsaVersion &IV) { + bool HasExtendedWaitCounts = IV.Major >= 12; + if (HasExtendedWaitCounts) { + LoadcntMax = getLoadcntBitMask(IV); + DscntMax = getDscntBitMask(IV); + } else { + LoadcntMax = getVmcntBitMask(IV); + DscntMax = getLgkmcntBitMask(IV); + } + ExpcntMax = getExpcntBitMask(IV); + StorecntMax = getStorecntBitMask(IV); + SamplecntMax = getSamplecntBitMask(IV); + BvhcntMax = getBvhcntBitMask(IV); + KmcntMax = getKmcntBitMask(IV); + XcntMax = getXcntBitMask(IV); + VaVdstMax = DepCtr::getVaVdstBitMask(); + VmVsrcMax = DepCtr::getVmVsrcBitMask(); +} + unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), getVmcntBitWidthLo(Version.Major)); @@ -2019,6 +2082,22 @@ int 
encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, STI); } +unsigned getVaVdstBitMask() { return (1 << getVaVdstBitWidth()) - 1; } + +unsigned getVaSdstBitMask() { return (1 << getVaSdstBitWidth()) - 1; } + +unsigned getVaSsrcBitMask() { return (1 << getVaSsrcBitWidth()) - 1; } + +unsigned getHoldCntBitMask(const IsaVersion &Version) { + return (1 << getHoldCntWidth(Version.Major, Version.Minor)) - 1; +} + +unsigned getVmVsrcBitMask() { return (1 << getVmVsrcBitWidth()) - 1; } + +unsigned getVaVccBitMask() { return (1 << getVaVccBitWidth()) - 1; } + +unsigned getSaSdstBitMask() { return (1 << getSaSdstBitWidth()) - 1; } + unsigned decodeFieldVmVsrc(unsigned Encoded) { return unpackBits(Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } @@ -2043,64 +2122,74 @@ unsigned decodeFieldVaSsrc(unsigned Encoded) { return unpackBits(Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); } -unsigned decodeFieldHoldCnt(unsigned Encoded) { - return unpackBits(Encoded, getHoldCntBitShift(), getHoldCntWidth()); +unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version) { + return unpackBits(Encoded, getHoldCntBitShift(), + getHoldCntWidth(Version.Major, Version.Minor)); } unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); } -unsigned encodeFieldVmVsrc(unsigned VmVsrc) { - return encodeFieldVmVsrc(0xffff, VmVsrc); +unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVmVsrc(Encoded, VmVsrc); } unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) { return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth()); } -unsigned encodeFieldVaVdst(unsigned VaVdst) { - return encodeFieldVaVdst(0xffff, VaVdst); +unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return 
encodeFieldVaVdst(Encoded, VaVdst); } unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) { return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); } -unsigned encodeFieldSaSdst(unsigned SaSdst) { - return encodeFieldSaSdst(0xffff, SaSdst); +unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldSaSdst(Encoded, SaSdst); } unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) { return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth()); } -unsigned encodeFieldVaSdst(unsigned VaSdst) { - return encodeFieldVaSdst(0xffff, VaSdst); +unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaSdst(Encoded, VaSdst); } unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) { return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth()); } -unsigned encodeFieldVaVcc(unsigned VaVcc) { - return encodeFieldVaVcc(0xffff, VaVcc); +unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaVcc(Encoded, VaVcc); } unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) { return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth()); } -unsigned encodeFieldVaSsrc(unsigned VaSsrc) { - return encodeFieldVaSsrc(0xffff, VaSsrc); +unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldVaSsrc(Encoded, VaSsrc); } -unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) { - return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth()); +unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt, + const IsaVersion &Version) { + return packBits(HoldCnt, Encoded, getHoldCntBitShift(), + getHoldCntWidth(Version.Major, 
Version.Minor)); } -unsigned encodeFieldHoldCnt(unsigned HoldCnt) { - return encodeFieldHoldCnt(0xffff, HoldCnt); +unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) { + unsigned Encoded = getDefaultDepCtrEncoding(STI); + return encodeFieldHoldCnt(Encoded, HoldCnt, getIsaVersion(STI.getCPU())); } } // namespace DepCtr @@ -2450,7 +2539,7 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) { } unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { - if (isGFX1250(STI)) + if (isGFX1250Plus(STI)) return 32; return 16; } @@ -2517,14 +2606,26 @@ bool isGFX12(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX12]; } -bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); } +bool isGFX12Plus(const MCSubtargetInfo &STI) { + return isGFX12(STI) || isGFX13Plus(STI); +} bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); } bool isGFX1250(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts] && !isGFX13(STI); +} + +bool isGFX1250Plus(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts]; } +bool isGFX13(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX13]; +} + +bool isGFX13Plus(const MCSubtargetInfo &STI) { return isGFX13(STI); } + bool supportsWGP(const MCSubtargetInfo &STI) { if (isGFX1250(STI)) return false; @@ -2578,7 +2679,7 @@ bool hasMAIInsts(const MCSubtargetInfo &STI) { } bool hasVOPD(const MCSubtargetInfo &STI) { - return STI.hasFeature(AMDGPU::FeatureVOPD); + return STI.hasFeature(AMDGPU::FeatureVOPDInsts); } bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) { @@ -2697,8 +2798,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } -bool isInlineValue(unsigned Reg) { - switch (Reg) { +bool isInlineValue(MCRegister Reg) { + switch (Reg.id()) { case AMDGPU::SRC_SHARED_BASE_LO: case 
AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT_LO: @@ -2743,6 +2844,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: @@ -3104,6 +3206,34 @@ std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { return getInlineEncodingV216(true, Literal); } +// Encoding of the literal as an inline constant for V_PK_FMAC_F16 instruction +// or nullopt. This accounts for different inline constant behavior: +// - Pre-GFX11: fp16 inline constants have the value in low 16 bits, 0 in high +// - GFX11+: fp16 inline constants are duplicated into both halves +std::optional<unsigned> getPKFMACF16InlineEncoding(uint32_t Literal, + bool IsGFX11Plus) { + // Pre-GFX11 behavior: f16 in low bits, 0 in high bits + if (!IsGFX11Plus) + return getInlineEncodingV216(/*IsFloat=*/true, Literal); + + // GFX11+ behavior: f16 duplicated in both halves + // First, check for sign-extended integer inline constants (-16 to 64) + // These work the same across all generations + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + // For float inline constants on GFX11+, both halves must be equal + uint16_t Lo = static_cast<uint16_t>(Literal); + uint16_t Hi = static_cast<uint16_t>(Literal >> 16); + if (Lo != Hi) + return std::nullopt; + return getInlineEncodingV216(/*IsFloat=*/true, Lo); +} + // Whether the given literal can be inlined for a V_PK_* instruction. 
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { @@ -3113,6 +3243,8 @@ bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return getInlineEncodingV216(true, Literal).has_value(); + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: + llvm_unreachable("OPERAND_REG_IMM_V2FP16_SPLAT is not supported"); case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return isInlinableLiteralV2BF16(Literal); @@ -3138,6 +3270,11 @@ bool isInlinableLiteralV2F16(uint32_t Literal) { return getInlineEncodingV2F16(Literal).has_value(); } +// Whether the given literal can be inlined for V_PK_FMAC_F16 instruction. +bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus) { + return getPKFMACF16InlineEncoding(Literal, IsGFX11Plus).has_value(); +} + bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { if (IsFP64) return !Lo_32(Val); @@ -3159,6 +3296,7 @@ int64_t encode32BitLiteral(int64_t Imm, OperandType Type, bool IsLit) { case OPERAND_REG_IMM_INT32: case OPERAND_REG_IMM_V2BF16: case OPERAND_REG_IMM_V2FP16: + case OPERAND_REG_IMM_V2FP16_SPLAT: case OPERAND_REG_IMM_V2FP32: case OPERAND_REG_IMM_V2INT16: case OPERAND_REG_IMM_V2INT32: @@ -3361,7 +3499,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI) { const unsigned VGPRClasses[] = { AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, @@ -3382,22 +3520,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, return nullptr; } -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; return Idx >> 
8; } -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI) { +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx >= 0x100) - return AMDGPU::NoRegister; + return MCRegister(); const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) - return AMDGPU::NoRegister; + return MCRegister(); Idx |= MSBs << 8; if (RC->getID() == AMDGPU::VGPR_16RegClassID) { @@ -3438,17 +3576,42 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) { AMDGPU::OpName::src0Y, AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vsrc2Y, AMDGPU::OpName::vdstY}; + // VOP2 MADMK instructions use src0, imm, src1 scheme. + static const AMDGPU::OpName VOP2MADMKOps[4] = { + AMDGPU::OpName::src0, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::src1, AMDGPU::OpName::vdst}; + static const AMDGPU::OpName VOPDFMAMKOpsX[4] = { + AMDGPU::OpName::src0X, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::vsrc1X, AMDGPU::OpName::vdstX}; + static const AMDGPU::OpName VOPDFMAMKOpsY[4] = { + AMDGPU::OpName::src0Y, AMDGPU::OpName::NUM_OPERAND_NAMES, + AMDGPU::OpName::vsrc1Y, AMDGPU::OpName::vdstY}; + unsigned TSFlags = Desc.TSFlags; if (TSFlags & (SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | SIInstrFlags::VOPC | SIInstrFlags::DPP)) { + switch (Desc.getOpcode()) { // LD_SCALE operands ignore MSB. 
- if (Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32 || - Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250 || - Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64 || - Desc.getOpcode() == AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250) + case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32: + case AMDGPU::V_WMMA_LD_SCALE_PAIRED_B32_gfx1250: + case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64: + case AMDGPU::V_WMMA_LD_SCALE16_PAIRED_B64_gfx1250: return {}; + case AMDGPU::V_FMAMK_F16: + case AMDGPU::V_FMAMK_F16_t16: + case AMDGPU::V_FMAMK_F16_t16_gfx12: + case AMDGPU::V_FMAMK_F16_fake16: + case AMDGPU::V_FMAMK_F16_fake16_gfx12: + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAMK_F32_gfx12: + case AMDGPU::V_FMAMK_F64: + case AMDGPU::V_FMAMK_F64_gfx1250: + return {VOP2MADMKOps, nullptr}; + default: + break; + } return {VOPOps, nullptr}; } @@ -3464,8 +3627,11 @@ getVGPRLoweringOperandTables(const MCInstrDesc &Desc) { if (TSFlags & SIInstrFlags::VIMAGE) return {VIMGOps, nullptr}; - if (AMDGPU::isVOPD(Desc.getOpcode())) - return {VOPDOpsX, VOPDOpsY}; + if (AMDGPU::isVOPD(Desc.getOpcode())) { + auto [OpX, OpY] = getVOPDComponents(Desc.getOpcode()); + return {(OpX == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsX : VOPDOpsX, + (OpY == AMDGPU::V_FMAMK_F32) ? VOPDFMAMKOpsY : VOPDOpsY}; + } assert(!(TSFlags & SIInstrFlags::MIMG)); @@ -3545,8 +3711,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 
256 - : 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768)) + return 64; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536)) + return 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) + return 320; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 512; + return 64; // In sync with getAddressableLocalMemorySize } bool isPackedFP32Inst(unsigned Opc) { @@ -3599,9 +3772,9 @@ ClusterDimsAttr ClusterDimsAttr::get(const Function &F) { if (!Attr.has_value()) AttrKind = Kind::Unknown; - else if (all_of(*Attr, [](unsigned V) { return V == EncoNoCluster; })) + else if (all_of(*Attr, equal_to(EncoNoCluster))) AttrKind = Kind::NoCluster; - else if (all_of(*Attr, [](unsigned V) { return V == EncoVariableDims; })) + else if (all_of(*Attr, equal_to(EncoVariableDims))) AttrKind = Kind::VariableDims; ClusterDimsAttr A(AttrKind); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 5e3195b..7500c24 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -98,7 +98,7 @@ struct GcnBufferFormatInfo { }; struct MAIInstInfo { - uint16_t Opcode; + uint32_t Opcode; bool is_dgemm; bool is_gfx940_xdl; }; @@ -121,7 +121,7 @@ struct True16D16Info { }; struct WMMAInstInfo { - uint16_t Opcode; + uint32_t Opcode; bool is_wmma_xdl; }; @@ -416,7 +416,7 @@ inline bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx) { } LLVM_READONLY -int getSOPPWithRelaxation(uint16_t Opcode); +int64_t getSOPPWithRelaxation(uint32_t Opcode); struct MIMGBaseOpcodeInfo { MIMGBaseOpcode BaseOpcode; @@ -522,8 +522,8 @@ unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, bool IsG16Supported); struct MIMGInfo { - uint16_t Opcode; - uint16_t BaseOpcode; + uint32_t Opcode; + uint32_t BaseOpcode; uint8_t MIMGEncoding; uint8_t VDataDwords; uint8_t VAddrDwords; @@ -646,7 +646,7 @@ const 
GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, const MCSubtargetInfo &STI); LLVM_READONLY -int getMCOpcode(uint16_t Opcode, unsigned Gen); +int64_t getMCOpcode(uint32_t Opcode, unsigned Gen); LLVM_READONLY unsigned getVOPDOpcode(unsigned Opc, bool VOPD3); @@ -909,7 +909,7 @@ private: const ComponentInfo CompInfo[COMPONENTS_NUM]; public: - using RegIndices = std::array<unsigned, Component::MAX_OPR_NUM>; + using RegIndices = std::array<MCRegister, Component::MAX_OPR_NUM>; InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY) : CompInfo{OpX, OpY} {} @@ -932,9 +932,10 @@ public: // even though it violates requirement to be from different banks. // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. - bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx, - const MCRegisterInfo &MRI, bool SkipSrc = false, - bool AllowSameVGPR = false, bool VOPD3 = false) const { + bool + hasInvalidOperand(std::function<MCRegister(unsigned, unsigned)> GetRegIdx, + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const { return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR, VOPD3) .has_value(); @@ -949,14 +950,14 @@ public: // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. 
std::optional<unsigned> getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc = false, bool AllowSameVGPR = false, bool VOPD3 = false) const; private: RegIndices getRegIndices(unsigned ComponentIdx, - std::function<unsigned(unsigned, unsigned)> GetRegIdx, + std::function<MCRegister(unsigned, unsigned)> GetRegIdx, bool VOPD3) const; }; @@ -1075,6 +1076,37 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size); /// Checks if \p Val is inside \p MD, a !range-like metadata. bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val); +enum InstCounterType { + LOAD_CNT = 0, // VMcnt prior to gfx12. + DS_CNT, // LKGMcnt prior to gfx12. + EXP_CNT, // + STORE_CNT, // VScnt in gfx10/gfx11. + NUM_NORMAL_INST_CNTS, + SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. + BVH_CNT, // gfx12+ only. + KM_CNT, // gfx12+ only. + X_CNT, // gfx1250. + NUM_EXTENDED_INST_CNTS, + VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only. + VM_VSRC, // gfx12+ expert mode only. + NUM_EXPERT_INST_CNTS, + NUM_INST_CNTS = NUM_EXPERT_INST_CNTS +}; + +// Return an iterator over all counters between LOAD_CNT (the first counter) +// and \c MaxCounter (exclusive, default value yields an enumeration over +// all counters). +iota_range<InstCounterType> +inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS); + +} // namespace AMDGPU + +template <> struct enum_iteration_traits<AMDGPU::InstCounterType> { + static constexpr bool is_iterable = true; +}; + +namespace AMDGPU { + /// Represents the counter values to wait for in an s_waitcnt instruction. /// /// Large values (including the maximum possible integer) can be used to @@ -1088,6 +1120,71 @@ struct Waitcnt { unsigned BvhCnt = ~0u; // gfx12+ only. unsigned KmCnt = ~0u; // gfx12+ only. unsigned XCnt = ~0u; // gfx1250. + unsigned VaVdst = ~0u; // gfx12+ expert scheduling mode only. 
+ unsigned VmVsrc = ~0u; // gfx12+ expert scheduling mode only. + + unsigned get(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return LoadCnt; + case EXP_CNT: + return ExpCnt; + case DS_CNT: + return DsCnt; + case STORE_CNT: + return StoreCnt; + case SAMPLE_CNT: + return SampleCnt; + case BVH_CNT: + return BvhCnt; + case KM_CNT: + return KmCnt; + case X_CNT: + return XCnt; + case VA_VDST: + return VaVdst; + case VM_VSRC: + return VmVsrc; + default: + llvm_unreachable("bad InstCounterType"); + } + } + void set(InstCounterType T, unsigned Val) { + switch (T) { + case LOAD_CNT: + LoadCnt = Val; + break; + case EXP_CNT: + ExpCnt = Val; + break; + case DS_CNT: + DsCnt = Val; + break; + case STORE_CNT: + StoreCnt = Val; + break; + case SAMPLE_CNT: + SampleCnt = Val; + break; + case BVH_CNT: + BvhCnt = Val; + break; + case KM_CNT: + KmCnt = Val; + break; + case X_CNT: + XCnt = Val; + break; + case VA_VDST: + VaVdst = Val; + break; + case VM_VSRC: + VmVsrc = Val; + break; + default: + llvm_unreachable("bad InstCounterType"); + } + } Waitcnt() = default; // Pre-gfx12 constructor. @@ -1096,19 +1193,24 @@ struct Waitcnt { // gfx12+ constructor. 
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, - unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt) + unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt, + unsigned VaVdst, unsigned VmVsrc) : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), - SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt) {} + SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt), XCnt(XCnt), + VaVdst(VaVdst), VmVsrc(VmVsrc) {} bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u || - SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u; + SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u || XCnt != ~0u || + VaVdst != ~0u || VmVsrc != ~0u; } bool hasWaitStoreCnt() const { return StoreCnt != ~0u; } + bool hasWaitDepctr() const { return VaVdst != ~0u || VmVsrc != ~0u; } + Waitcnt combined(const Waitcnt &Other) const { // Does the right thing provided self and Other are either both pre-gfx12 // or both gfx12+. @@ -1116,8 +1218,30 @@ struct Waitcnt { std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt), std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt), std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt), - std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt)); + std::min(KmCnt, Other.KmCnt), std::min(XCnt, Other.XCnt), + std::min(VaVdst, Other.VaVdst), std::min(VmVsrc, Other.VmVsrc)); } + + friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait); +}; + +/// Represents the hardware counter limits for different wait count types. +struct HardwareLimits { + unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12. + unsigned ExpcntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. 
+ unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. + unsigned XcntMax; // gfx1250. + unsigned VaVdstMax; // gfx12+ expert mode only. + unsigned VmVsrcMax; // gfx12+ expert mode only. + + HardwareLimits() = default; + + /// Initializes hardware limits from ISA version. + HardwareLimits(const IsaVersion &IV); }; // The following methods are only meaningful on targets that support @@ -1278,6 +1402,27 @@ bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal, bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, bool &IsDefault, const MCSubtargetInfo &STI); +/// \returns Maximum VaVdst value that can be encoded. +unsigned getVaVdstBitMask(); + +/// \returns Maximum VaSdst value that can be encoded. +unsigned getVaSdstBitMask(); + +/// \returns Maximum VaSsrc value that can be encoded. +unsigned getVaSsrcBitMask(); + +/// \returns Maximum HoldCnt value that can be encoded. +unsigned getHoldCntBitMask(const IsaVersion &Version); + +/// \returns Maximum VmVsrc value that can be encoded. +unsigned getVmVsrcBitMask(); + +/// \returns Maximum VaVcc value that can be encoded. +unsigned getVaVccBitMask(); + +/// \returns Maximum SaSdst value that can be encoded. +unsigned getSaSdstBitMask(); + /// \returns Decoded VaVdst from given immediate \p Encoded. unsigned decodeFieldVaVdst(unsigned Encoded); @@ -1297,46 +1442,47 @@ unsigned decodeFieldVaVcc(unsigned Encoded); unsigned decodeFieldVaSsrc(unsigned Encoded); /// \returns Decoded HoldCnt from given immediate \p Encoded. -unsigned decodeFieldHoldCnt(unsigned Encoded); +unsigned decodeFieldHoldCnt(unsigned Encoded, const IsaVersion &Version); /// \returns \p VmVsrc as an encoded Depctr immediate. -unsigned encodeFieldVmVsrc(unsigned VmVsrc); +unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VmVsrc. 
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc); /// \returns \p VaVdst as an encoded Depctr immediate. -unsigned encodeFieldVaVdst(unsigned VaVdst); +unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaVdst. unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst); /// \returns \p SaSdst as an encoded Depctr immediate. -unsigned encodeFieldSaSdst(unsigned SaSdst); +unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p SaSdst. unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst); /// \returns \p VaSdst as an encoded Depctr immediate. -unsigned encodeFieldVaSdst(unsigned VaSdst); +unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaSdst. unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst); /// \returns \p VaVcc as an encoded Depctr immediate. -unsigned encodeFieldVaVcc(unsigned VaVcc); +unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaVcc. unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc); /// \returns \p HoldCnt as an encoded Depctr immediate. -unsigned encodeFieldHoldCnt(unsigned HoldCnt); +unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p HoldCnt. -unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded); +unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt, + const IsaVersion &Version); /// \returns \p VaSsrc as an encoded Depctr immediate. -unsigned encodeFieldVaSsrc(unsigned VaSsrc); +unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI); /// \returns \p Encoded combined with encoded \p VaSsrc. 
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc); @@ -1513,6 +1659,8 @@ constexpr inline bool isKernel(CallingConv::ID CC) { } } +inline bool isKernel(const Function &F) { return isKernel(F.getCallingConv()); } + LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC) { return CC == CallingConv::Fast; @@ -1561,6 +1709,9 @@ bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); bool isGFX1250(const MCSubtargetInfo &STI); +bool isGFX1250Plus(const MCSubtargetInfo &STI); +bool isGFX13(const MCSubtargetInfo &STI); +bool isGFX13Plus(const MCSubtargetInfo &STI); bool supportsWGP(const MCSubtargetInfo &STI); bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); @@ -1599,7 +1750,7 @@ LLVM_READNONE MCRegister mc2PseudoReg(MCRegister Reg); LLVM_READNONE -bool isInlineValue(unsigned Reg); +bool isInlineValue(MCRegister Reg); /// Is this an AMDGPU specific source operand? These include registers, /// inline constants, literals and mandatory literals (KImm). 
@@ -1663,6 +1814,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: return 2; @@ -1709,6 +1861,10 @@ LLVM_READNONE std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE +std::optional<unsigned> getPKFMACF16InlineEncoding(uint32_t Literal, + bool IsGFX11Plus); + +LLVM_READNONE bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE @@ -1721,6 +1877,9 @@ LLVM_READNONE bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE +bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus); + +LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); LLVM_READNONE @@ -1798,16 +1957,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID); /// \returns a register class for the physical register \p Reg if it is a VGPR /// or nullptr otherwise. -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI); /// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the /// physical register \p Reg. -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI); +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set. -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI); +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI); // Returns a table for the opcode with a given \p Desc to map the VGPR MSB // set by the S_SET_VGPR_MSB to one of 4 sources. 
In case of VOPD returns 2 @@ -1867,7 +2026,7 @@ private: Kind AttrKind = Kind::Unknown; }; -} // end namespace AMDGPU +} // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::IsaInfo::TargetIDSetting S); diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 5e89e34..75437cf 100644 --- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -10,7 +10,7 @@ // VINTERP encoding //===----------------------------------------------------------------------===// -class VINTERPe <VOPProfile P> : Enc64 { +class VINTERPe : Enc64 { bits<11> vdst; bits<4> src0_modifiers; bits<11> src0; @@ -27,10 +27,10 @@ class VINTERPe <VOPProfile P> : Enc64 { let Inst{7-0} = vdst{7-0}; let Inst{10-8} = waitexp; // Fields for hi/lo 16-bits of register selection - let Inst{11} = !if(P.HasSrc0, src0_modifiers{2}, 0); - let Inst{12} = !if(P.HasSrc1, src1_modifiers{2}, 0); - let Inst{13} = !if(P.HasSrc2, src2_modifiers{2}, 0); - let Inst{14} = !if(P.HasDst, src0_modifiers{3}, 0); + let Inst{11} = src0_modifiers{2}; + let Inst{12} = src1_modifiers{2}; + let Inst{13} = src2_modifiers{2}; + let Inst{14} = src0_modifiers{3}; let Inst{15} = clamp; let Inst{40-32} = src0{8-0}; let Inst{49-41} = src1{8-0}; @@ -40,11 +40,11 @@ class VINTERPe <VOPProfile P> : Enc64 { let Inst{63} = src2_modifiers{0}; // neg(2) } -class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> { +class VINTERPe_gfx11 <bits<7> op> : VINTERPe { let Inst{22-16} = op; } -class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> { +class VINTERPe_gfx12 <bits<7> op> : VINTERPe { let Inst{20-16} = op{4-0}; } @@ -243,7 +243,7 @@ multiclass VINTERP_Real_gfx11 <bits<7> op, string asmName> { !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { def _gfx11 : VINTERP_Real<ps, SIEncodingFamily.GFX11, asmName>, - VINTERPe_gfx11<op, ps.Pfl>; + VINTERPe_gfx11<op>; } } @@ -253,7 +253,7 @@ multiclass VINTERP_Real_gfx12 
<bits<7> op, string asmName> { !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { def _gfx12 : VINTERP_Real<ps, SIEncodingFamily.GFX12, asmName>, - VINTERPe_gfx12<op, ps.Pfl>; + VINTERPe_gfx12<op>; } } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e0..56e7623 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -263,16 +263,19 @@ let HasOMod = 0, HasClamp = 0 in { let isReMaterializable = 1 in { let SchedRW = [WriteDoubleCvt] in { // OMod clears exceptions when set in this instruction +let IsDPMACCInstruction = 1 in defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>; let mayRaiseFPException = 0 in { defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; } +let IsDPMACCInstruction = 1 in { defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; -defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>; // OMod clears exceptions when set in this instruction defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>; +} // IsDPMACCInstruction = 1 +defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>; let mayRaiseFPException = 0 in { defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; @@ -349,11 +352,11 @@ defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans32] -let TRANS = 1, SchedRW = [WriteTrans64] in { +let TRANS = 1, SchedRW = [WriteTrans64], IsDPMACCInstruction = 1 in { defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64_NO_DPP, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64_NO_DPP, AMDGPUrsq>; defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64_NO_DPP, int_amdgcn_sqrt>; -} // End TRANS = 1, SchedRW = [WriteTrans64] +} // End TRANS = 1, SchedRW = 
[WriteTrans64], IsDPMACCInstruction = 1 let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; @@ -369,18 +372,45 @@ defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64_SPECIAL_OMOD, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; let FPDPRounding = 1 in { defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; } // End FPDPRounding = 1 -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>; } // End isReMaterializable = 1 +// These i32 conversions naturally saturate. 
+def : GCNPat<(i32 (fp_to_uint_sat (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_U32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)), i32)), + (V_CVT_I32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat f32:$src0, i32)), (V_CVT_U32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat f32:$src0, i32)), (V_CVT_I32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_uint_sat f64:$src0, i32)), (V_CVT_U32_F64_e32 (f64 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat f64:$src0, i32)), (V_CVT_I32_F64_e32 (f64 $src0))>; + +def : GCNPat<(i32 (fp_to_uint_sat_gi (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat_gi (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat_gi (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))), + (V_CVT_U32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_sint_sat_gi (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))), + (V_CVT_I32_F64_e64 $src0_modifiers, $src0)>; +def : GCNPat<(i32 (fp_to_uint_sat_gi f32:$src0)), (V_CVT_U32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat_gi f32:$src0)), (V_CVT_I32_F32_e32 (f32 $src0))>; +def : GCNPat<(i32 (fp_to_uint_sat_gi f64:$src0)), (V_CVT_U32_F64_e32 (f64 $src0))>; +def : GCNPat<(i32 (fp_to_sint_sat_gi f64:$src0)), (V_CVT_I32_F64_e32 (f64 $src0))>; + defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>; // Restrict src0 to be VGPR @@ -493,12 +523,12 @@ let SubtargetPredicate = isGFX7GFX8GFX9 in { } // End SubtargetPredicate = isGFX7GFX8GFX9 let 
SubtargetPredicate = isGFX7Plus in { - let SchedRW = [WriteDoubleAdd] in { + let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 in { defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>; defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>; defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, froundeven>; defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>; - } // End SchedRW = [WriteDoubleAdd] + } // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 } // End SubtargetPredicate = isGFX7Plus } // End isReMaterializable = 1 @@ -513,6 +543,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <VOP_BF16_BF16>; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <VOP_BF16_BF16>; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +564,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate = HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 
<"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -593,15 +646,15 @@ let SubtargetPredicate = isGFX9Plus in { let isReMaterializable = 1 in defm V_SAT_PK_U8_I16 : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>; - - let mayRaiseFPException = 0 in { - defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; - defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", - VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; - } // End mayRaiseFPException 
= 0 } // End SubtargetPredicate = isGFX9Plus +let mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts in { +defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_i16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; +defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_norm_u16_f16", + VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>; +} // End mayRaiseFPException = 0, SubtargetPredicate = HasCvtNormInsts + let SubtargetPredicate = isGFX9Only in { defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; } // End SubtargetPredicate = isGFX9Only @@ -644,7 +697,7 @@ let OtherPredicates = [HasCvtFP8VOP1Bug] in { (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>; } -let OtherPredicates = [HasNoCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12 +let OtherPredicates = [NotHasCvtFP8VOP1Bug, HasSDWA] in { // FIXME: HasSDWA is a substitute for !gfx12 def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_e32 $src)>; def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), @@ -707,9 +760,9 @@ def V_CVT_F16_F8_True16_Profile : VOP3_Profile_True16<V_CVT_F16_F8_Profile>; def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>; } -let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], +let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { - let SubtargetPredicate = isGFX12PlusNot12_50 in + let SubtargetPredicate = isGFX11PlusNot12_50 in defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>; let SubtargetPredicate = isGFX125xOnly in defm V_CVT_F32_FP8_gfx1250 : VOP1Inst<"v_cvt_f32_fp8_gfx1250", VOPProfile_Base_CVT_F_F8_ByteSel<f32, 1>>; @@ -733,7 +786,7 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit 
HasOpSe >; let OtherPredicates = [HasFP8ConversionInsts] in { - let SubtargetPredicate = isGFX12PlusNot12_50 in + let SubtargetPredicate = isGFX11PlusNot12_50 in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>; let SubtargetPredicate = isGFX125xOnly in { def : GCNPat<(int_amdgcn_cvt_f32_fp8 i32:$src0, timm:$byte_sel), @@ -741,7 +794,7 @@ let OtherPredicates = [HasFP8ConversionInsts] in { def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel), (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>; } - let SubtargetPredicate = isGFX12Plus in + let SubtargetPredicate = isGFX11Plus in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>; } @@ -753,7 +806,7 @@ class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index, (inst_e32 $src)) >; -let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in { +let SubtargetPredicate = isGFX11Plus, OtherPredicates = [HasFP8ConversionInsts] in { foreach Index = [0, -1] in { def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index, V_CVT_PK_F32_FP8_fake16_e32, V_CVT_PK_F32_FP8_fake16_e64>; @@ -839,7 +892,7 @@ let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, [], /*VOP1Only=*/ 1>; - let isAsCheapAsAMove = 1 in + let isAsCheapAsAMove = 1, isMoveImm = 1 in defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; @@ -927,7 +980,7 @@ class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf } //===----------------------------------------------------------------------===// -// GFX11, GFX12 +// GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// multiclass VOP1Only_Real<GFXGen Gen, bits<9> op> { @@ -1001,10 +1054,19 @@ multiclass 
VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName, asmName>; } +multiclass VOP1_Realtriple_e64_with_name_gfx12_gfx13< + bits<9> op, string opName, string asmName> : + VOP1_Realtriple_e64_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Realtriple_e64_with_name<GFX13Gen, op, opName, asmName>; + multiclass VOP1_Real_FULL<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op>, VOP1_Realtriple_e64<Gen, op>, VOP1_Real_dpp<Gen, op>, VOP1_Real_dpp8<Gen, op>; +multiclass VOP1_Real_FULL_gfx1250_gfx13<bits<9> op> : + VOP1_Real_FULL<GFX1250Gen, op>, + VOP1_Real_FULL<GFX13Gen, op>; + multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName, string asmName> { defm NAME : VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>, @@ -1016,11 +1078,14 @@ multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName, } } -multiclass VOP1_Real_NO_VOP3_with_name_gfx12<bits<9> op, string opName, - string asmName> { +multiclass VOP1_Real_NO_VOP3_with_name_gfx12_gfx13< + bits<9> op, string opName, string asmName> { defm NAME : VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>, VOP1_Real_dpp_with_name<GFX12Gen, op, opName, asmName>, VOP1_Real_dpp8_with_name<GFX12Gen, op, opName, asmName>; + defm NAME : VOP1_Real_e32_with_name<GFX13Gen, op, opName, asmName>, + VOP1_Real_dpp_with_name<GFX13Gen, op, opName, asmName>, + VOP1_Real_dpp8_with_name<GFX13Gen, op, opName, asmName>; } multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName, @@ -1030,6 +1095,11 @@ multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName, VOP1_Real_dpp8_with_name<Gen, op, opName, asmName>, VOP1_Realtriple_e64_with_name<Gen, op, opName, asmName>; +multiclass VOP1_Real_FULL_with_name_gfx1250_gfx13< + bits<9> op, string opName, string asmName> : + VOP1_Real_FULL_with_name<GFX1250Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; + multiclass VOP1_Real_NO_DPP<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op>, 
VOP1_Real_e64<Gen, op>; @@ -1038,134 +1108,159 @@ multiclass VOP1_Real_with_DPP16<GFXGen Gen, bits<9> op> : VOP1_Real_dpp<Gen, op>, VOP3_Real_dpp_Base<Gen, {0, 1, 1, op{6-0}}>; -multiclass VOP1_Real_FULL_t16_gfx11_gfx12<bits<9> op, string asmName, - string opName = NAME> : +multiclass VOP1_Real_FULL_t16_gfx11_gfx12_gfx13< + bits<9> op, string asmName, string opName = NAME> : VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, - VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; + +multiclass VOP1_Real_FULL_with_name_gfx12_gfx13< + bits<9> op, string opName, string asmName> : + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; -multiclass VOP1_Real_FULL_with_name_gfx11_gfx12<bits<9> op, string opName, - string asmName> : +multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13< + bits<9> op, string opName, string asmName> : VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, - VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; -multiclass VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12< +multiclass VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : - VOP1_Real_FULL_with_name_gfx11_gfx12<op, opName#"_t16", asmName>; + VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<op, opName#"_t16", asmName>; defm opName#"_fake16": - VOP1_Real_FULL_with_name_gfx11_gfx12<op, opName#"_fake16", asmName>; + VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<op, opName#"_fake16", asmName>; } -multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> : - VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>; +multiclass VOP1Only_Real_gfx11_gfx12_gfx13<bits<9> op> : + VOP1Only_Real<GFX11Gen, op>, 
VOP1Only_Real<GFX12Gen, op>, + VOP1Only_Real<GFX13Gen, op>; multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< +multiclass VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<bits<9> op, string opName, + string asmName> : + VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>, + VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_e32_with_name<GFX13Gen, op, opName, asmName>; + +multiclass VOP1_Real_FULL_t16<GFXGen Gen, bits<9> op> : + VOP1_Real_FULL_with_name<Gen, op, NAME, + !cast<VOP1_Pseudo>(!subst("_fake16", "", NAME)#"_e32").Mnemonic>; + +multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : - VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_t16", asmName>; + VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_t16", asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName#"_t16", asmName>; defm opName#"_fake16": - VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>; + VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName#"_fake16", asmName>; } -multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_not_gfx1250<bits<9> op, string opName, - string asmName> : +multiclass VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13_not_gfx1250<bits<9> op, string opName, + string asmName> : VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, - VOP1_Real_FULL_with_name<GFX12Not12_50Gen, op, opName, asmName>; + VOP1_Real_FULL_with_name<GFX12Not12_50Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX13Gen, op, opName, asmName>; -multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> { +multiclass VOP1_Real_OpSelIsDPP<GFXGen Gen, bits<9> op> : VOP1_Real_e32<Gen, op> { defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); - def _e64_gfx1250 : - VOP3_Real_Gen<ps, GFX1250Gen>, + 
def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; } -defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name_gfx11_gfx12_not_gfx1250<0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">; +multiclass VOP1_Real_OpSelIsDPP_gfx1250_gfx13<bits<9> op> : + VOP1_Real_OpSelIsDPP<GFX1250Gen, op>, + VOP1_Real_OpSelIsDPP<GFX13Gen, op>; + +defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13_not_gfx1250<0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">; defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">; -defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">; -defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; -defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; -defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; -defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; -defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; +defm V_CVT_PK_F32_FP8_t16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; 
+defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">; +defm V_CVT_PK_F32_FP8_t16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">; +defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_BF8_t16 : VOP1_Real_e32_with_name_gfx11_gfx12_gfx13<0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">; +defm V_CVT_PK_F32_BF8_t16 : VOP3_Real_with_name_gfx11_gfx12_gfx13<0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">; -defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, +defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; -defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, +defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x00d, "V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">; -defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x039, +defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x039, "V_FFBH_U32", "v_clz_i32_u32">; -defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a, +defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x03a, "V_FFBL_B32", "v_ctz_i32_b32">; -defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b, +defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12_gfx13<0x03b, "V_FFBH_I32", "v_cls_i32">; -defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>; -defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>; -defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">; -defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x069>; -defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06a>; -defm V_CVT_U32_U16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x06b>; - -defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x050>; -defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x051>; -defm V_CVT_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x052>; -defm V_CVT_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x053>; -defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">; -defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">; -defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">; -defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">; -defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">; -defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">; -defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; -defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">; -defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; -defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x059>; -defm V_FREXP_EXP_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05a>; -defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; -defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; -defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; -defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05d>; -defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05e>; -defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x05f>; -defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x060>; -defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x061>; -defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x062>; -defm 
V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x063>; -defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064>; - -defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00a>; -defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; +defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12_gfx13<0x066>; +defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12_gfx13<0x067>; +defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x01c, "v_mov_b16">; +defm V_NOT_B16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x069>; +defm V_CVT_I32_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x06a>; +defm V_CVT_U32_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x06b>; + +defm V_CVT_F16_U16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x050>; +defm V_CVT_F16_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x051>; +defm V_CVT_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x052>; +defm V_CVT_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x053>; +defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x054, "v_rcp_f16">; +defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x054, "v_rcp_f16">; +defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x055, "v_sqrt_f16">; +defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x055, "v_sqrt_f16">; +defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x056, "v_rsq_f16">; +defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x056, "v_rsq_f16">; +defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x057, "v_log_f16">; +defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x057, "v_log_f16">; +defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x058, "v_exp_f16">; +defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x058, "v_exp_f16">; +defm V_FREXP_MANT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x059>; +defm V_FREXP_EXP_I16_F16 : 
VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05a>; +defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05b, "v_floor_f16">; +defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05b, "v_floor_f16">; +defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05c, "v_ceil_f16">; +defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12_gfx13<0x05c, "v_ceil_f16">; +defm V_TRUNC_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05d>; +defm V_RNDNE_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05e>; +defm V_FRACT_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x05f>; +defm V_SIN_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x060>; +defm V_COS_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x061>; +defm V_SAT_PK_U8_I16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x062>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x063>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x064>; + +defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x00a>; +defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12_gfx13<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; -defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; -defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; -defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; -defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; -defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; -defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">; -defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; -defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; -defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; -defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; -defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; -defm V_CVT_F16_BF8 : 
VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; -defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; -defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; -defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; -defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; -defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; -defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; -defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; +defm V_TANH_F32 : VOP1_Real_FULL_gfx1250_gfx13<0x01e>; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x01f>; +defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250_gfx13<0x049>; +defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x04a>; +defm V_PRNG_B32 : VOP1_Real_FULL_gfx1250_gfx13<0x04b>; +defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x072, "v_cvt_f32_bf16">; +defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x073>; +defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x074>; +defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x075>; +defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x076>; +defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x077>; +defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x078>; +defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x079>; +defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07b>; +defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250_gfx13<0x07f>; //===----------------------------------------------------------------------===// // GFX10. 
@@ -1213,17 +1308,22 @@ multiclass VOP1_Real_gfx10_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12_gfx13<bits<9> op> : VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_NO_DPP<GFX12Gen, op>; + VOP1_Real_NO_DPP<GFX12Gen, op>, + VOP1_Real_NO_DPP<GFX13Gen, op>; multiclass VOP1Only_Real_gfx10_gfx11_gfx12<bits<9> op> : VOP1Only_Real_gfx10<op>, VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>; -defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<0x01b>; +multiclass VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<bits<9> op> : + VOP1Only_Real_gfx10_gfx11_gfx12<op>, + VOP1Only_Real<GFX13Gen, op>; + +defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x01b>; defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11_gfx12<0x048>; defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; @@ -1247,7 +1347,7 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; -defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x065>; +defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<0x065>; defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x068>; //===----------------------------------------------------------------------===// @@ -1270,20 +1370,20 @@ let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { multiclass VOP1_Real_gfx7<bits<9> op> : VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>; -multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<bits<9> op> : VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_with_DPP16<GFX12Gen, op>; + VOP1_Real_with_DPP16<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>; defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; defm 
V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; -defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x017>; -defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x018>; -defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x019>; -defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x01a>; +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x018>; +defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x01a>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10, GFX11, GFX12 +// GFX6, GFX7, GFX10, GFX11, GFX12, GFX13 //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { @@ -1314,16 +1414,20 @@ multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<bits<9> op> : + VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<op>, + VOP1_Real_FULL<GFX13Gen, op>; + +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<bits<9> op> : VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_NO_DPP<GFX12Gen, op>; + VOP1_Real_NO_DPP<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, op>; -multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<bits<9> op> : +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<bits<9> op> : VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, - VOP1_Real_with_DPP16<GFX12Gen, op>; + VOP1_Real_with_DPP16<GFX12Gen, op>, VOP1_Real_NO_DPP<GFX13Gen, 
op>; -multiclass VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<9> op> : - VOP1Only_Real_gfx6_gfx7<op>, VOP1Only_Real_gfx10_gfx11_gfx12<op>; +multiclass VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<bits<9> op> : + VOP1Only_Real_gfx6_gfx7<op>, VOP1Only_Real_gfx10_gfx11_gfx12_gfx13<op>; defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; @@ -1333,59 +1437,63 @@ defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; -defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x000>; -defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x001>; -defm V_READFIRSTLANE_B32 : VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>; -defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x003>; -defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x004>; -defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x005>; -defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x006>; -defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x007>; -defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x008>; +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x001>; +defm V_READFIRSTLANE_B32 : VOP1Only_Real_gfx6_gfx7_gfx10_gfx11_gfx12_gfx13<0x002>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x004>; +defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x008>; defm V_CVT_F16_F32 : 
VOP1_Real_gfx6_gfx7_gfx10<0x00a>; defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x00e>; -defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x00f>; -defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x010>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x011>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x012>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x013>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x014>; -defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x015>; -defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x016>; -defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x020>; -defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x021>; -defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x022>; -defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x023>; -defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x024>; -defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x025>; -defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x027>; -defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02a>; -defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02b>; -defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02e>; -defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x02f>; -defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x031>; -defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x033>; -defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x034>; -defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x035>; -defm 
V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x036>; -defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x037>; -defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x038>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x025>; +defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x033>; +defm V_SQRT_F64 : 
VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12_gfx13<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x038>; defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03c>; -defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03d>; -defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_with_DPP16_gfx12<0x03e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x03f>; -defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x040>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx13_with_DPP16_gfx12<0x03e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12_gfx13<0x040>; defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x042>; defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x043>; defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x044>; +def : AMDGPUMnemonicAlias<"v_brev_b32", "v_bfrev_b32"> { + let AssemblerPredicate = isGFX13Plus; +} + //===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index d87d250..2ccf392 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -15,8 +15,8 @@ class VOP2e <bits<6> op, VOPProfile P> : Enc32 { bits<9> src0; bits<8> src1; - let Inst{8-0} = !if(P.HasSrc0, src0, 0); - let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{8-0} = !if(P.HasSrc0, src0, ?); + let Inst{16-9} = !if(P.HasSrc1, src1, ?); let Inst{24-17} = !if(P.EmitDst, vdst, 0); let Inst{30-25} = op; let Inst{31} = 0x0; //encoding @@ -28,8 +28,8 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 { bits<8> src1; bits<32> imm; - let Inst{8-0} = !if(P.HasSrc0, src0, 0); - let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{8-0} = !if(P.HasSrc0, src0, ?); + let Inst{16-9} = !if(P.HasSrc1, src1, ?); let Inst{24-17} = !if(P.EmitDst, vdst, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding @@ -42,8 +42,8 @@ class VOP2_MADK64e <bits<6> op, VOPProfile P> : Enc96 { bits<8> src1; bits<64> imm; - let Inst{8-0} = !if(P.HasSrc0, src0, 0); - let Inst{16-9} = !if(P.HasSrc1, src1, 0); + let Inst{8-0} = !if(P.HasSrc0, src0, ?); + let Inst{16-9} = !if(P.HasSrc1, src1, ?); let Inst{24-17} = !if(P.EmitDst, vdst, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding @@ -55,7 +55,7 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding @@ -66,11 +66,11 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> { bits<9> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = 
!if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; // encoding - let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr + let Inst{63} = !if(P.HasSrc1, src1{8}, ?); // src1_sgpr } class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : @@ -971,7 +971,7 @@ defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_a } // End IsNeverUniform = 1 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>; -let ReadsModeReg = 0, mayRaiseFPException = 0 in { +let ReadsModeReg = 0, mayRaiseFPException = 0, SubtargetPredicate = HasCvtPkNormVOP2Insts in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>; } @@ -1266,14 +1266,14 @@ let Constraints = "$vdst = $src2", defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">; } // End SubtargetPredicate = HasDLInsts -let SubtargetPredicate = HasFmaLegacy32 in { +let SubtargetPredicate = HasFmacLegacy32 in { let Constraints = "$vdst = $src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in defm V_FMAC_LEGACY_F32 : VOP2Inst <"v_fmac_legacy_f32", VOP_MAC_LEGACY_F32>; -} // End SubtargetPredicate = HasFmaLegacy32 +} // End SubtargetPredicate = HasFmacLegacy32 let SubtargetPredicate = HasFmacF64Inst, Constraints = "$vdst = $src2", @@ -1348,10 +1348,15 @@ let isCommutable = 1 in def V_FMAAK_F64 : VOP2_Pseudo<"v_fmaak_f64", VOP_MADAK_F64, [], "">; } // End SubtargetPredicate = HasFmaakFmamkF64Insts, isReMaterializable = 1, FixedSize = 1, Size = 12, SchedRW = [Write64Bit] +// A dedicated profile for V_PK_FMAC_F16. +def VOP_V2F16_V2F16_V2F16_SPLAT : VOPProfile <[v2f16, v2f16, v2f16, untyped]> { + let Src0RC32 = VSrc_v2f16_splat; +} + let SubtargetPredicate = HasPkFmacF16Inst in { // FIXME: V_PK_FMAC_F16 is currently not used in instruction selection. 
// If this changes, ensure the DPP variant is not used for GFX11+. -defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; +defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16_SPLAT>; } // End SubtargetPredicate = HasPkFmacF16Inst // Note: 16-bit instructions produce a 0 result in the high 16-bits @@ -1481,7 +1486,7 @@ let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in { } // End SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 let SubtargetPredicate = HasIEEEMinimumMaximumInsts, isReMaterializable = 1, - SchedRW = [WriteDoubleAdd], isCommutable = 1 in { + SchedRW = [WriteDoubleAdd], isCommutable = 1, IsDPMACCInstruction = 1 in { defm V_MIN_NUM_F64 : VOP2Inst_VOPD <"v_min_num_f64", VOP_F64_F64_F64, 0x24, "v_min_num_f64", fminnum_like>; defm V_MAX_NUM_F64 : VOP2Inst_VOPD <"v_max_num_f64", VOP_F64_F64_F64, 0x23, "v_max_num_f64", fmaxnum_like>; } @@ -1502,7 +1507,7 @@ class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps, bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; - let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; @@ -1544,7 +1549,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, bits<8> src1; let Inst{8-0} = fi; - let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(p.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; @@ -2346,7 +2351,7 @@ class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; //dpp - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{30-25} = op; let Inst{31} = 0x0; //encoding diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 42ec8ba..bdcf04f 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -151,7 +151,7 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, class VOP3_INTERP16 <list<ValueType> ArgVT, bit OpSel = 0> : VOPProfile<ArgVT> { let IsSingle = 1; - let HasOMod = !ne(DstVT.Value, f16.Value); + let HasOMod = !ne(DstVT, f16); let HasHigh = 1; let HasOpSel = OpSel; @@ -185,7 +185,8 @@ defm V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", defm V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>, VOPD_Component<0x13, "v_fma_f32">; -defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; +let SubtargetPredicate = HasLerpInst in + defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; let SchedRW = [WriteIntMul] in { let SubtargetPredicate = HasMadU32Inst in @@ -198,9 +199,11 @@ let SchedRW = [WriteIntMul] in { let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { +let IsDPMACCInstruction = 1 in defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP_F64_F64_F64_F64_DPP_PROF, any_fma>, VOPD_Component<0x20, "v_fma_f64">; let SubtargetPredicate = isNotGFX12Plus in { defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>; +let IsDPMACCInstruction = 1 in defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fmul>; } // End SubtargetPredicate = isNotGFX12Plus } // End FPDPRounding = 1 @@ -223,10 +226,10 @@ defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, f defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, fminimum>; defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, fmaximum>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 
1 in { defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>; defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>; -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], IsDPMACCInstruction = 1 } // End SubtargetPredicate = HasIEEEMinimumMaximumInsts, ReadsModeReg = 0, AddedComplexity = 1 let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDoubleAdd] in { @@ -251,19 +254,19 @@ defm V_DIV_FMAS_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f32", VOP_F32_F32_F32 // if (vcc) // result *= 2^64 // -let SchedRW = [WriteDouble], FPDPRounding = 1 in +let SchedRW = [WriteDouble], FPDPRounding = 1, IsDPMACCInstruction = 1 in defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>; } // End Uses = [MODE, VCC, EXEC] } // End isCommutable = 1 let isReMaterializable = 1 in { -let mayRaiseFPException = 0 in { +let mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts in { defm V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>; defm V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>; defm V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>; defm V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>; -} // End mayRaiseFPException +} // mayRaiseFPException = 0, SubtargetPredicate = HasCubeInsts defm V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>; defm V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>; @@ -306,20 +309,20 @@ let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in { defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 -let isCommutable = 1 in { +let isCommutable = 1, SubtargetPredicate = 
HasSadInsts in { defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -} // End isCommutable = 1 +} // End isCommutable = 1, SubtargetPredicate = HasSadInsts defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>; -let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { +let SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP_F64_F64_F64_F64_DPP_PROF, AMDGPUdiv_fixup>; defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>; -} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 +} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1, IsDPMACCInstruction = 1 } // End isReMaterializable = 1 let SubtargetPredicate = isGFX9GFX10 in @@ -357,7 +360,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ; // Double precision division pre-scale. 
- let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in + let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1, IsDPMACCInstruction = 1 in defm V_DIV_SCALE_F64 : VOP3Inst <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>; } // End mayRaiseFPException = 0 @@ -370,12 +373,12 @@ defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64 let isReMaterializable = 1 in { -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDouble], IsDPMACCInstruction = 1 in { defm V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop>; -} // End SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble], IsDPMACCInstruction = 1 let SchedRW = [Write64Bit] in { - let SubtargetPredicate = isGFX6GFX7 in { + let SubtargetPredicate = isGFX6GFX7, IsDPMACCInstruction = 1 in { defm V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, cshl_64>; defm V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, csrl_64>; defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, csra_64>; @@ -424,15 +427,16 @@ def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> { let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { -defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; +let SubtargetPredicate = HasQsadInsts in + defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] } // End SubtargetPredicate = isGFX7Plus let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let SubtargetPredicate = isGFX7Plus in { - defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug]>; - defm V_MAD_I64_I32 : VOP3Inst 
<"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug]>; + defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [NotHasMADIntraFwdBug]>; + defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [NotHasMADIntraFwdBug]>; } let SubtargetPredicate = isGFX11Only, OtherPredicates = [HasMADIntraFwdBug], Constraints = "@earlyclobber $vdst" in { @@ -634,19 +638,13 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDA } def shl_0_to_4 : PatFrag< - (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), - [{ - if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - return C->getZExtValue() <= 4; - } - return false; - }]> { + (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), [{ + KnownBits KB = CurDAG->computeKnownBits(N->getOperand(1)); + return KB.getMaxValue().getZExtValue() <= 4; + }]> { let GISelPredicateCode = [{ - int64_t Imm = 0; - if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) && - !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm)))) - return false; - return (uint64_t)Imm <= 4; + KnownBits KB = VT->getKnownBits(MI.getOperand(2).getReg()); + return KB.getMaxValue().getZExtValue() <= 4; }]; } @@ -775,10 +773,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in { } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 let SubtargetPredicate = HasAddMinMaxInsts, isCommutable = 1, isReMaterializable = 1 in { - defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP_I32_I32_I32_I32>; - defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP_I32_I32_I32_I32>; + defm V_ADD_MAX_I32 : VOP3Inst <"v_add_max_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_i32>; + defm V_ADD_MAX_U32 : VOP3Inst <"v_add_max_u32", 
VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_max_u32>; + defm V_ADD_MIN_I32 : VOP3Inst <"v_add_min_i32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_i32>; + defm V_ADD_MIN_U32 : VOP3Inst <"v_add_min_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>, int_amdgcn_add_min_u32>; } defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; @@ -789,9 +787,6 @@ let isCommutable = 1 in { defm V_MAD_I32_I16 : VOP3Inst_t16 <"v_mad_i32_i16", VOP_I32_I16_I16_I32>; } // End isCommutable = 1 -defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; -defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; - defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>; let isReMaterializable = 1 in { @@ -820,13 +815,13 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, VOP3_CVT_PK_F8_F32_Profile_t16<>, VOP3_CVT_PK_F8_F32_Profile_fake16<>>; - let SubtargetPredicate = isGFX12Plus in { + let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in defm V_CVT_SR_FP8_F32_gfx1250 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx1250", VOP3_CVT_SR_F8_ByteSel_Profile<f32, true>>; defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>; - } + } // End SubtargetPredicate = isGFX11Plus } // These instructions have non-standard use of op_sel. 
In particular they are @@ -930,7 +925,7 @@ let SubtargetPredicate = isGFX940Plus in { } } -let SubtargetPredicate = isGFX12Plus in { +let SubtargetPredicate = isGFX11Plus in { let OtherPredicates = [HasFP8ConversionInsts, NotHasFP8E5M3Insts] in def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>; let OtherPredicates = [HasFP8ConversionInsts, HasFP8E5M3Insts] in { @@ -938,7 +933,7 @@ let SubtargetPredicate = isGFX12Plus in { def : Cvt_SR_F8_ByteSel_E5M3_Pat<int_amdgcn_cvt_sr_fp8_f32_e5m3, V_CVT_SR_FP8_F32_gfx1250_e64, f32, DSTCLAMP.ENABLE>; } def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>; -} +} // End SubtargetPredicate = isGFX11Plus } class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < @@ -976,10 +971,10 @@ def : GCNPat < } // End SubtargetPredicate = HasLshlAddU64Inst let SubtargetPredicate = HasAddMinMaxInsts in { -def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; -def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; -def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; -def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>; } def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; @@ -996,6 +991,11 @@ def : GCNPat<(DivergentBinFrag<or> (or_oneuse i64:$src0, i64:$src1), i64:$src2), } // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = HasCvtPkNormVOP3Insts in { + defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>; + defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>; +} // end SubtargetPredicate = HasCvtPkNormVOP3Insts + // FIXME: Probably should hardcode clamp bit in pseudo and avoid this. 
class OpSelBinOpClampPat<SDPatternOperator node, Instruction inst> : GCNPat< @@ -1061,7 +1061,7 @@ multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> { } // exclude pre-GFX9 where it was slow -let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { +let OtherPredicates = [NotHasMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats<V_MAD_U64_U32_e64>; defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>; } @@ -1717,6 +1717,28 @@ let SubtargetPredicate = isGFX11Plus in { defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>; defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; + + def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))), + (V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>; + def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))), + (V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>; + + // Fallback patterns for f32->i16 conversion. These are only required because + // f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above. 
+ let True16Predicate = UseRealTrue16Insts in { + def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>; + def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>; + } + let True16Predicate = NotUseRealTrue16Insts in { + def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>; + def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>; + } } // End SubtargetPredicate = isGFX11Plus class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 6500fce..9a4054b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -75,7 +75,7 @@ multiclass VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo<OpName, P, !if (P.HasModifiers, - getVOP3PModPat<P, node, IsDOT, IsDOT>.ret, + getVOP3PModPat<P, node, !or(P.EnableClamp, IsDOT), IsDOT>.ret, getVOP3Pat<P, node>.ret)>; let SubtargetPredicate = isGFX11Plus in { if P.HasExtVOP3DPP then @@ -182,6 +182,8 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like, ValueType VT = f16> { defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); + defvar OneImm = !if (!eq(VT, bf16), CONST.BF16_ONE, CONST.FP16_ONE); + defvar NegOneImm = !if (!eq(VT, bf16), CONST.BF16_NEG_ONE, CONST.FP16_NEG_ONE); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. 
// TODO: Could we use a predicate to inspect src1/2/3 instead? @@ -203,6 +205,34 @@ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; + + // (fadd x, y) -> (fma x, 1.0, y) + def : GCNPat < + (f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), + (mix_inst $src0_mods, $src0, (i32 8), (i32 OneImm), $src1_mods, $src1, + DSTCLAMP.NONE)>; + + // (fmul x, y) -> (fma x, y, -0.0) + def : GCNPat < + (f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), + (mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 SRCMODS.NEG), (i32 0), + DSTCLAMP.NONE)>; + + // (fsub x, y) -> (fma y, -1.0, x) + def : GCNPat < + (f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), + (mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0, + DSTCLAMP.NONE)>; + + // (fsub x, y) -> (fma y, -1.0, x) + def : GCNPat < + (f32 (fsub (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))), + (mix_inst $src1_mods, $src1, (i32 8), (i32 NegOneImm), $src0_mods, $src0, + DSTCLAMP.NONE)>; } multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like, @@ -235,7 +265,7 @@ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + (i32 SRCMODS.NONE), (i32 0), DSTCLAMP.NONE, (i32 (IMPLICIT_DEF))) >; @@ -245,7 +275,7 @@ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + (i32 SRCMODS.NONE), (i32 0), 
DSTCLAMP.NONE, VGPR_32:$elt0)) >; @@ -299,7 +329,7 @@ multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like, (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mix_inst_16 $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + (i32 SRCMODS.NONE), (i32 0), DSTCLAMP.NONE) >; @@ -434,15 +464,16 @@ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>; } // End SubtargetPredicate = HasFmaMixBF16Insts def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> { - let HasModifiers = 0; + let HasNeg = 0; + let EnableClamp = 1; } let isCommutable = 1, isReMaterializable = 1 in { let SubtargetPredicate = HasPkAddMinMaxInsts in { -defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile>; -defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile>; +defm V_PK_ADD_MAX_I16 : VOP3PInst<"v_pk_add_max_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_i16>; +defm V_PK_ADD_MAX_U16 : VOP3PInst<"v_pk_add_max_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_max_u16>; +defm V_PK_ADD_MIN_I16 : VOP3PInst<"v_pk_add_min_i16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_i16>; +defm V_PK_ADD_MIN_U16 : VOP3PInst<"v_pk_add_min_u16", PK_ADD_MINMAX_Profile, int_amdgcn_pk_add_min_u16>; } let SubtargetPredicate = HasPkMinMax3Insts in { defm V_PK_MAX3_I16 : VOP3PInst<"v_pk_max3_i16", PK_ADD_MINMAX_Profile>; @@ -463,10 +494,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2, >; let SubtargetPredicate = HasPkAddMinMaxInsts in { -def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>; -def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>; -def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>; -def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smax, 
V_PK_ADD_MAX_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>; } let SubtargetPredicate = HasPkMinMax3Insts in { @@ -662,7 +693,6 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>, multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> { defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>; - let SubtargetPredicate = isGFX12Plus in def : GCNPat <(intrinsic_node i32:$src0, i32:$src1, (VOP3Mods f32:$src2, i32:$src2_modifiers)), (!cast<Instruction>(NAME) i32:$src0, i32:$src1, @@ -995,6 +1025,7 @@ class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled = Instruction Opcode = !cast<Instruction>(NAME); bit is_dgemm = 0; bit is_gfx940_xdl = 0; + let isConvergent = 1; let PseudoInstr = NAME; // FIXME: Why is this not the default } @@ -1032,7 +1063,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"); - let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { + let mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { def _e64 : MAIInst<OpName, ProfileAGPR, @@ -1059,7 +1090,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag, MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; } } - } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 + } // mayRaiseFPException = 0, ReadsModeReg = 1 } // Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32 @@ -1363,16 +1394,10 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { field bit is_wmma_xdl; } -def WMMAOpcode : GenericEnum { - let FilterClass = "VOP3P_Pseudo"; -} - class WMMAMappingTable : GenericTable { let FilterClass = "WMMAOpcodeMapping"; let CppTypeName = "WMMAOpcodeMappingInfo"; let Fields = ["Opcode2Addr", "Opcode3Addr"]; - string TypeOf_Opcode2Addr = "WMMAOpcode"; - string TypeOf_Opcode3Addr = "WMMAOpcode"; } def WMMAOpcode2AddrMappingTable : WMMAMappingTable { @@ -1401,13 +1426,13 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in { def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; } } if convertibleTo3Addr then { - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; } @@ -1453,13 +1478,12 @@ let WaveSizePredicate = isWave64 in { } class 
VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _IsIU, bit _IsFP8BF8, bit _Has_ImodOp = 0, bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0, bit _Scale16 = 0, bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; - bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B - bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32)); + bit NoABMods = !or(_IsFP8BF8, _IsF4); // No IMOD support for A and B int IndexType = _IndexType; let HasMatrixFMT = _HasMatrixFMT; @@ -1468,7 +1492,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, let HasMatrixReuse = _HasMatrixReuse; bit HasIModOp = _Has_ImodOp; - let HasClamp = !and(IsIU, !not(HasIModOp)); + let HasClamp = IsIU; let IsPacked = 1; let IsWMMA = !not(_IsSWMMAC); let IsSWMMAC = _IsSWMMAC; @@ -1487,9 +1511,9 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); bit NegLo01 = !not(NoABMods); - bit NegLo2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); + bit NegLo2 = !and(!not(IsIU), IsWMMA); bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1] - bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); + bit NegHi2 = !and(!not(IsIU), IsWMMA); bit NegLoAny = !or(NegLo01, NegLo2); bit NegHiAny = !or(NegHi01, NegHi2); @@ -1520,8 +1544,6 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32) // | neg_hi ignored | neg_hi = 1 abs C(f32) // --------------------------------------------------------------------------- - // wmma f32_xf32 | not allowed for xf32 | not allowed - // --------------------------------------------------------------------------- // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) // 
--------------------------------------------------------------------------- @@ -1552,13 +1574,13 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // pseudo - // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers)); dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers)); - dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag Src2Mods = !if(!or(IsIU, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); dag IndexKey = !cond(!eq(IndexType, 0) : (ins), !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit), @@ -1573,7 +1595,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt), (ins)); dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); - dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); + dag ClampOp = !if(HasClamp, (ins Clamp:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); @@ -1585,7 +1607,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, (ins VRegSrc_64:$src2), (ins VRegSrc_32:$src2)), IndexKey)), - MatrixScaleSrc, MatrixFMT, MatrixScale, MatrixReuse, Clamp, Neg); + MatrixScaleSrc, ClampOp, MatrixFMT, MatrixScale, MatrixReuse, Neg); // asm @@ 
-1635,22 +1657,21 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), IsIU : (ins (VOP3PModsNeg $src1_modifiers), Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); - bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); + bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU)); bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); - bit IsIUXF32 = !or(IsIU, IsXF32); dag Src2InPatWmma = !cond(IsC_IMod1 : (ins timm:$src2_modifiers, Src2VT:$src2), IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_BF16_IMod0 : (ins Src2VT:$src2), - IsIUXF32 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), IsSWMMAC : (ins)); dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs $src2_modifiers), Src2VT:$src2), IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), - IsIUXF32 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), IsSWMMAC : (ins)); dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins)); dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), @@ -1663,7 +1684,7 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); dag MatrixFMTOutPat = !if(HasMatrixFMT, (ins i32:$matrix_a_fmt, i32:$matrix_b_fmt), (ins)); dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins timm:$src2_modifiers), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); - dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins (i32 8)))), (ins Src2VT:$src2)); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), !if(IsC_IMod1, (ins (VOP3PModsNegAbs $src2_modifiers)), (ins 
(i32 8)))), (ins Src2VT:$src2)); dag MatrixScaleInPat = !if(HasMatrixScale, (ins timm:$matrix_a_scale, timm:$matrix_a_scale_fmt, ScaleTy:$scale_src0, timm:$matrix_b_scale, timm:$matrix_b_scale_fmt, ScaleTy:$scale_src1), (ins)); @@ -1674,17 +1695,17 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixScaleInPat, MatrixReuseInPat, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, MatrixFMTOutPat, - MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixScaleOutSrcPat, ClampPat, MatrixFMTOutPat, + MatrixScaleOutModPat, MatrixReuseOutModPat); dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); - dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat, MatrixReuseOutModPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. 
dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixScaleInPat, MatrixReuseInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, - MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixScaleOutSrcPat, ClampPat, + MatrixFMTOutPat, MatrixScaleOutModPat, MatrixReuseOutModPat); } def WMMAInstInfoTable : GenericTable { @@ -1706,7 +1727,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1733,7 +1754,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1756,84 +1777,126 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P // Non-standard types (iu8, iu4, fp8, bf8) will be packed in vectors of i32s. // Original type for them is in comment on the right and refers to A and B. 
-def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; -def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; -def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; -def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; -def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 -def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 -def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 -def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 - -def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; -def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; -def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; -def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; -def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 -def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * -def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 -def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 - -def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; -def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; -def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; -def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; -def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 -def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 
16xi4 -def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** -def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 - -def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; -def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; -def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; -def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; -def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 -def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** -def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 -def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 8xf8 +def 
I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4 + +def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 
8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/1, /*_IsFP8BF8=*/0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], /*_IsSWMMAC=*/1, /*_IndexType=*/8, /*_IsIU=*/0, /*_IsFP8BF8=*/1>; // 4xf8, 8xf8 // * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored // ** IU4X64_SWMMAC_w32 index is i32, index_key is not used // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes -def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, 
v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 0, 0, 0, 1>; -def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>; -def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>; -def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>; -def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>; -def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>; -def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 0, 0, 0, 1>; -def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 
1, 1, 0, 0, 0, 1>; -def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 0, 0, 0, 1>; -def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 0, 0, 0, 1>; - -multiclass WMMA_F8F6F4_Profiles<bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> { - def _f8_f8_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f8_f6_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f8_f4_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f6_f8_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f6_f6_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f6_f4_w32 : VOP3PWMMA_Profile<[v8f32, v12i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f4_f8_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f4_f6_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v12i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; - def _f4_f4_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; -} - -defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<0, 0, 0>; -defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<1, 0, 1>; -defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<1, 1, 1>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, 
v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, 
v8f16], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/0, /*_IsF4=*/1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/1, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/1, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], /*_IsSWMMAC=*/0, /*_IndexType=*/0, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/1, /*_Scale16=*/1, /*_HasMatrixReuse=*/1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def BF16_BF16X64_SWMMAC_w32 : 
VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], /*_IsSWMMAC=*/1, /*_IndexType=*/16, /*_IsIU=*/0, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/0, /*_IsFP8BF8=*/1, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], /*_IsSWMMAC=*/1, /*_IndexType=*/32, /*_IsIU=*/1, /*_IsFP8BF8=*/0, + /*_Has_ImodOp=*/1, /*_HasMatrixFMT=*/0, /*_HasMatrixScale=*/0, /*_Scale16=*/0, /*_HasMatrixReuse=*/1>; + +// Helper class to compute the destination vector type of WMMA_F8F6F4 instructions based on element type and dimensions. +class getWMMAF8F6F4DstVTy<ValueType DstEltTy, int M, int N> { + // Size in bits = (M * N / 32) * element_size_in_bits + defvar Size = !mul(!div(!mul(M, N), 32), DstEltTy.Size); + ValueType ret = !cond(!eq(Size, 256) : v8f32, + !eq(Size, 1024) : v64f16); +} + +// Helper class to compute the type of matrix A and B of WMMA_F8F6F4 instructions based on format and dimensions. 
+class getWMMAF8F6F4ABVTy<string Fmt, int D1, int D2> { + defvar FmtBits = !cond(!eq(Fmt, "f8") : 8, + !eq(Fmt, "f6") : 6, + !eq(Fmt, "f4") : 4); + // TypeSize in bits = (D1 * D2 / 32) * format_bits + defvar TypeSize = !mul(!div(!mul(D1, D2), 32), FmtBits); + ValueType ret = !cond(!eq(TypeSize, 256) : v8i32, + !eq(TypeSize, 384) : v12i32, + !eq(TypeSize, 512) : v16i32, + !eq(TypeSize, 1024) : v32i32); +} + +multiclass WMMA_F8F6F4_Profiles<ValueType DstEltTy, int M, int N, int K, + bit HasMatrixScale, bit Scale16, bit HasMatrixReuse> { + defvar DstTy = getWMMAF8F6F4DstVTy<DstEltTy, M, N>.ret; + foreach ATy = ["f8", "f6", "f4"] in { + foreach BTy = ["f8", "f6", "f4"] in { + def _#ATy#_#BTy#_w32 : VOP3PWMMA_Profile< + [DstTy, getWMMAF8F6F4ABVTy<ATy, M, K>.ret, getWMMAF8F6F4ABVTy<BTy, K, N>.ret, DstTy], + 0, 0, 0, 1, 1, 1, HasMatrixScale, Scale16, HasMatrixReuse>; + } + } +} + +defm F32_16X16X128_F8F6F4 : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/0, /*Scale16=*/0, /*HasMatrixReuse=*/0>; +defm F32_16X16X128_F8F6F4_SCALE : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/1, /*Scale16=*/0, /*HasMatrixReuse=*/1>; +defm F32_16X16X128_F8F6F4_SCALE16 : WMMA_F8F6F4_Profiles<f32, /*M=*/16, /*N=*/16, /*K=*/128, /*HasMatrixScale=*/1, /*Scale16=*/1, /*HasMatrixReuse=*/1>; class VOP_WMMA_LD_SCALE<ValueType vt, RegisterOperand RC> : VOP3P_Profile<VOPProfile<[untyped, vt, vt, untyped]>> { let HasMatrixScale = 1; @@ -1905,8 +1968,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 @@ -2182,20 +2247,23 @@ multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> { } } -multiclass VOP3P_Real_WMMA_F8F6F4_gfx1250<bits<8> op, VOP3PWMMA_Profile WMMAP> { +multiclass VOP3P_Real_WMMA_F8F6F4<string Gen, bits<8> op, VOP3PWMMA_Profile WMMAP> { defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); - let AsmString = asmName # PS.AsmOperands in - defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>, - MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_gfx1250">; + let AsmString = asmName # PS.AsmOperands in { + if !eq(Gen, "gfx1250") then { + defm NAME : VOP3P_Real_WMMA_gfx1250<op, WMMAP>, + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_twoaddr_" # Gen>; + } + } } -multiclass VOP3P_Real_WMMA_gfx1250_SrcFormats<bits<8> op, string WMMAP> { - defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; +multiclass VOP3P_Real_WMMA_SrcFormats<string Gen, bits<8> op, string WMMAP> { + defm _f8_f8_w32 : VOP3P_Real_WMMA_F8F6F4<Gen, op, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
- defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4_gfx1250<op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + defm _#I#_w32 : VOP3P_Real_WMMA_F8F6F4<Gen, op, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; } } } @@ -2215,7 +2283,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{23-16} = LdScaleOp; let Inst{40-32} = scale_src0; let Inst{49-41} = scale_src1; - let Inst{58-50} = 0; // scale src2 + let Inst{58-50} = ?; // scale src2 let Inst{59} = matrix_b_scale{0}; // scale_op_sel_hi(0) let Inst{60} = 0; // scale_op_sel_hi(1) let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo) @@ -2234,9 +2302,9 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{87-80} = op; let Inst{95-88} = 0xcc; //encoding - let Inst{104-96} = !if(P.HasSrc0, src0, 0); - let Inst{113-105} = !if(P.HasSrc1, src1, 0); - let Inst{122-114} = !if(P.HasSrc2, src2, 0); + let Inst{104-96} = !if(P.HasSrc0, src0, ?); + let Inst{113-105} = !if(P.HasSrc1, src1, ?); + let Inst{122-114} = !if(P.HasSrc2, src2, ?); // neg_lo let Inst{125} = !if(P.NegLo01, src0_modifiers{0}, 0); @@ -2244,34 +2312,35 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0); } -multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { - defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); - let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, - DecoderNamespace = "GFX1250" in { +multiclass VOP3PX2_Real_ScaledWMMA_F4<string Gen, bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { + defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); + if !eq(Gen, "gfx1250") then { def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>, - VOP3PX2e <op, LdScaleOp, WMMAP>; + VOP3PX2e <op, LdScaleOp, WMMAP> { + let PostEncoderMethod = "postEncodeVOP3<true, true, false>"; + } } } -multiclass 
VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { +multiclass VOP3PX2_Real_ScaledWMMA<string Gen, bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> { defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr"); defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); defvar psName = !substr(NAME, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32"))); - let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32, - DecoderNamespace = "GFX1250" in { + if !eq(Gen, "gfx1250") then { def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, asmName>, VOP3PX2e <op, LdScaleOp, WMMAP>, - MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_gfx1250"> { + MFMA_F8F6F4_WithSizeTable_Helper<PS, psName # "_f8_f8_w32_" # Gen> { let AsmString = asmName # PS.AsmOperands; + let PostEncoderMethod = "postEncodeVOP3<true, true, false>"; } } } -multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<bits<8> op, bits<8> LdScaleOp, string WMMAP> { - defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; +multiclass VOP3PX2_Real_ScaledWMMA_SrcFormats<string Gen, bits<8> op, bits<8> LdScaleOp, string WMMAP> { + defm _f8_f8_w32 : VOP3PX2_Real_ScaledWMMA<Gen, op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_f8_f8_w32")>; foreach I = ["f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in { let isAsmParserOnly = true in { // Disable ambiguous disassembly. 
- defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; + defm _#I#_w32 : VOP3PX2_Real_ScaledWMMA<Gen, op, LdScaleOp, !cast<VOP3PWMMA_Profile>(WMMAP # "_" # I # "_w32")>; } } } @@ -2350,12 +2419,14 @@ defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8B defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>; defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; -defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x033, "F32_16X16X128_F8F6F4">; -defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; -defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in { +defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_SrcFormats <"gfx1250", 0x033, "F32_16X16X128_F8F6F4">; +defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats <"gfx1250", 0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">; +defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats <"gfx1250", 0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">; -defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>; -defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>; +defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4 <"gfx1250", 0x088, 0x35, F32_32X16X128_F4_SCALE_w32>; +defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4 <"gfx1250", 0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>; +} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" defm V_SWMMAC_F32_16X16X64_F16_w32 : 
VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; @@ -2417,6 +2488,11 @@ multiclass VOP3P_Realtriple<GFXGen Gen, bits<8> op, string backing_ps_name = NAM multiclass VOP3P_Realtriple_gfx11_gfx12<bits<8> op> : VOP3P_Realtriple<GFX11Gen, op>, VOP3P_Realtriple<GFX12Gen, op>; +defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x24>; +defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x25>; +defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple_gfx11_gfx12<0x26>; +defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple_gfx11_gfx12<0x27>; + //===----------------------------------------------------------------------===// // GFX12 //===----------------------------------------------------------------------===// @@ -2459,8 +2535,10 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>; defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3e>; defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3f>; -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; +let PostEncoderMethod = "postEncodeVOP3<true, true, false>" in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3P_Real_gfx1250<0x35>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>; +} let AssemblerPredicate = isGFX1250Plus in def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; @@ -2468,10 +2546,6 @@ def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16", "v_fma_mix_f32">; defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; -defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>; -defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>; -defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>; -defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>; //===----------------------------------------------------------------------===// // GFX11 diff --git 
a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 2730ec5..989181b 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -24,7 +24,7 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = op; let Inst{31-25} = 0x3e; // encoding } @@ -33,10 +33,10 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> { bits<9> src1; let Inst{8-0} = 0xf9; // sdwa - let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, ?); let Inst{24-17} = op; let Inst{31-25} = 0x3e; // encoding - let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr + let Inst{63} = !if(P.HasSrc1, src1{8}, ?); // src1_sgpr } @@ -422,7 +422,6 @@ multiclass VOPC_Pseudos <string opName, } -let SubtargetPredicate = HasSdstCMPX in { multiclass VOPCX_Pseudos <string opName, VOPC_Profile P, VOPC_Profile P_NoSDst, SDPatternOperator cond = COND_NULL, @@ -486,7 +485,6 @@ multiclass VOPCX_Pseudos <string opName, } } // end SubtargetPredicate = isGFX11Plus } -} // End SubtargetPredicate = HasSdstCMPX defm VOPC_I1_F16_F16 : VOPC_Profile_t16<[Write32Bit], f16>; def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; @@ -518,8 +516,10 @@ multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL, multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>; -multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : - VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>; +let IsDPMACCInstruction = 1 in { + multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>; +} multiclass 
VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> { @@ -537,9 +537,10 @@ multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>; -multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : - VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; - +let IsDPMACCInstruction = 1 in { + multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +} multiclass VOPCX_F16<string opName, string revOp = opName> { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -556,8 +557,10 @@ multiclass VOPCX_F16<string opName, string revOp = opName> { multiclass VOPCX_F32 <string opName, string revOp = opName> : VOPCX_Pseudos <opName, VOPC_I1_F32_F32, VOPC_F32_F32, COND_NULL, revOp>; -multiclass VOPCX_F64 <string opName, string revOp = opName> : - VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>; +let IsDPMACCInstruction = 1 in { + multiclass VOPCX_F64 <string opName, string revOp = opName> : + VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>; +} multiclass VOPCX_I16<string opName, string revOp = opName> { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -574,8 +577,10 @@ multiclass VOPCX_I16<string opName, string revOp = opName> { multiclass VOPCX_I32 <string opName, string revOp = opName> : VOPCX_Pseudos <opName, VOPC_I1_I32_I32, VOPC_I32_I32, COND_NULL, revOp>; -multiclass VOPCX_I64 <string opName, string revOp = opName> : - VOPCX_Pseudos <opName, VOPC_I1_I64_I64, VOPC_I64_I64, COND_NULL, revOp>; +let IsDPMACCInstruction = 1 in { + multiclass VOPCX_I64 <string opName, string revOp = opName> : + VOPCX_Pseudos <opName, VOPC_I1_I64_I64, 
VOPC_I64_I64, COND_NULL, revOp>; +} //===----------------------------------------------------------------------===// @@ -1114,7 +1119,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, } // end SubtargetPredicate = isGFX11Plus } -let SubtargetPredicate = HasSdstCMPX in { multiclass VOPCX_Class_Pseudos <string opName, VOPC_Profile P, VOPC_Profile P_NoSDst> : @@ -1164,7 +1168,6 @@ multiclass VOPCX_Class_Pseudos <string opName, } } // end SubtargetPredicate = isGFX11Plus } -} // End SubtargetPredicate = HasSdstCMPX } // End ReadsModeReg = 0, mayRaiseFPException = 0 defm VOPC_I1_F16_I16 : VOPC_Class_Profile_t16<[Write32Bit]>; @@ -1210,11 +1213,13 @@ multiclass VOPC_CLASS_F32 <string opName> { multiclass VOPCX_CLASS_F32 <string opName> : VOPCX_Class_Pseudos <opName, VOPC_I1_F32_I32, VOPC_F32_I32>; +// FIXME: let IsDPMACCInstruction = 1 in multiclass VOPC_CLASS_F64 <string opName> { defm NAME : VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>; defm : VOPCClassPat64<NAME>; } +// FIXME: let IsDPMACCInstruction = 1 in multiclass VOPCX_CLASS_F64 <string opName> : VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>; @@ -1233,18 +1238,12 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; // We need to use COPY_TO_REGCLASS to w/a the problem when ReplaceAllUsesWith() // complaints it cannot replace i1 <-> i64/i32 if node was not morphed in place. multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt, dag dstInst = (inst $src0, $src1)> { - let WaveSizePredicate = isWave64 in def : GCNPat < - (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i64 (COPY_TO_REGCLASS dstInst, SReg_64)) + (WaveSizeVT (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + dstInst >; let WaveSizePredicate = isWave32 in { - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS dstInst, SReg_32)) - >; - // Support codegen of i64 setcc in wave32 mode. 
def : GCNPat < (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), @@ -1459,7 +1458,7 @@ class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P> let Inst{8-0} = 0xfa; - let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, ?); let Inst{48-40} = dpp_ctrl; let Inst{50} = fi; let Inst{51} = bound_ctrl; @@ -1485,7 +1484,7 @@ class VOPC_DPP8_Base<bits<8> op, string OpName, VOPProfile P> let Inst{8-0} = fi; - let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, ?); let Inst{63-40} = dpp8{23-0}; let AsmMatchConverter = "cvtDPP8"; @@ -1535,6 +1534,8 @@ class VOPC64_DPP<VOP_DPP_Pseudo ps, string opName = ps.OpName> let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; let Constraints = ps.Constraints; + + let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX"); } class VOPC64_DPP16_Dst<bits<10> op, VOP_DPP_Pseudo ps, @@ -1575,6 +1576,8 @@ class VOPC64_DPP8<VOP_Pseudo ps, string opName = ps.OpName> let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; let True16Predicate = ps.True16Predicate; + + let PostEncoderMethod = !if(!empty(ps.Defs), "", "postEncodeVOPCX"); } class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1777,6 +1780,7 @@ multiclass VOPCX_Real<GFXGen Gen, bits<9> op> { let Inst{7-0} = ?; // sdst let AsmString = !subst("_nosdst", "", ps64.Mnemonic) # "{_e64} " # ps64.AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>; @@ -1838,6 +1842,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, let Inst{7-0} = ?; // sdst let Inst{14} = 0; let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } } else { def _e64#Gen.Suffix @@ -1845,6 +1850,7 @@ multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { let Inst{7-0} = ?; // sdst let AsmString = 
asm_name # "{_e64} " # ps64.AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } } @@ -2186,6 +2192,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let Inst{7-0} = ?; // sdst let AsmString = !subst("_nosdst", "", !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").Mnemonic) # "{_e64} " # !cast<VOP3_Pseudo>(NAME#"_nosdst_e64").AsmOperands; + let PostEncoderMethod = "postEncodeVOPCX"; } if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8325c62..09fdb00 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -18,6 +18,7 @@ class LetDummies { bit isConvergent; bit isAsCheapAsAMove; bit FPDPRounding; + bit IsDPMACCInstruction; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -71,6 +72,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins, string Mnemonic = opName; Instruction Opcode = !cast<Instruction>(NAME); bit IsTrue16 = P.IsTrue16; + bit IsDPMACCInstruction = 0; VOPProfile Pfl = P; string AsmOperands; @@ -166,6 +168,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : class VOP_Real<VOP_Pseudo ps> { Instruction Opcode = !cast<Instruction>(NAME); bit IsSingle = ps.Pfl.IsSingle; + bit IsDPMACCInstruction = ps.IsDPMACCInstruction; } class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : @@ -198,6 +201,8 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni let isConvergent = ps.isConvergent; VOPProfile Pfl = ps.Pfl; + + let PostEncoderMethod = !if(!and(Pfl.HasSrc0, Pfl.HasSrc1, Pfl.HasSrc2), "", "postEncodeVOP3<"#Pfl.HasSrc0#","#Pfl.HasSrc1#","#Pfl.HasSrc2#">"); } class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> : @@ -238,9 +243,9 @@ class VOP3a<VOPProfile P> : Enc64 { let Inst{10} = !if(P.HasSrc2Mods, 
src2_modifiers{1}, 0); let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = !if(P.HasSrc0, src0, 0); - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0, ?); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); let Inst{60-59} = !if(P.HasOMod, omod, 0); let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); @@ -273,9 +278,9 @@ class VOP3a_t16<VOPProfile P> : Enc64 { let Inst{15} = !if(P.HasClamp, clamp{0}, 0); let Inst{31-26} = 0x35; - let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, ?); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?); let Inst{60-59} = !if(P.HasOMod, omod, 0); let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); @@ -457,9 +462,9 @@ class VOP3be <VOPProfile P> : Enc64 { let Inst{7-0} = vdst; let Inst{14-8} = sdst; let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = !if(P.HasSrc0, src0, 0); - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0, ?); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); let Inst{60-59} = !if(P.HasOMod, omod, 0); let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); @@ -509,9 +514,9 @@ class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let Inst{15} = !if(P.HasClamp, clamp{0}, 0); - let Inst{40-32} = !if(P.HasSrc0, src0, 0); - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0, ?); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = 
!if(P.HasSrc2, src2, ?); let Inst{59} = !cond(!and(P.HasSrc0, P.HasOpSel) : src0_modifiers{3}, P.IsDOT : 1, P.HasMatrixScale : matrix_b_scale{0}, @@ -546,12 +551,12 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64, VOP3Pe_MAI_ let Inst{22-16} = op; let Inst{31-23} = 0x1a7; //encoding - let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, ?); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); - let Inst{59} = !if(P.HasSrc0, src0{9}, 0); // acc(0) - let Inst{60} = !if(P.HasSrc1, src1{9}, 0); // acc(1) + let Inst{59} = !if(P.HasSrc0, src0{9}, ?); // acc(0) + let Inst{60} = !if(P.HasSrc1, src1{9}, ?); // acc(1) let Inst{63-61} = !if(P.HasSrc1, blgp, 0); } @@ -631,12 +636,12 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_ let Inst{86-80} = op; let Inst{95-87} = 0x1a7; //encoding - let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, 0); - let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, 0); - let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, 0); + let Inst{104-96} = !if(MFMAPfl.HasSrc0, src0{8-0}, ?); + let Inst{113-105} = !if(MFMAPfl.HasSrc1, src1{8-0}, ?); + let Inst{122-114} = !if(MFMAPfl.HasSrc2, src2, ?); - let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, 0); // acc(0) - let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, 0); // acc(1) + let Inst{123} = !if(MFMAPfl.HasSrc0, src0{9}, ?); // acc(0) + let Inst{124} = !if(MFMAPfl.HasSrc1, src1{9}, ?); // acc(1) let Inst{127-125} = !if(MFMAPfl.HasSrc1, blgp, 0); } @@ -698,7 +703,7 @@ class VOP_SDWAe<VOPProfile P> : Enc64 { bits<2> dst_unused; bits<1> clamp; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{42-40} = !if(P.EmitDstSel, dst_sel{2-0}, ?); let Inst{44-43} = !if(P.EmitDstSel, dst_unused{1-0}, ?); let Inst{45} = 
!if(P.HasSDWAClamp, clamp{0}, 0); @@ -732,11 +737,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 { bits<5> src1_modifiers; bits<1> src1_sgpr; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{4}, 0); let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); - let Inst{55} = !if(P.HasSrc0, src0{8}, 0); + let Inst{55} = !if(P.HasSrc0, src0{8}, ?); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0); let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{4}, 0); let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); @@ -765,16 +770,9 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> { } class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : - InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>, - VOP <opName>, - SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> { - - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; + VOP_Pseudo <opName, "_sdwa", P, P.OutsSDWA, P.InsSDWA, "", pattern> { - string Mnemonic = opName; - string AsmOperands = P.AsmSDWA; + let AsmOperands = P.AsmSDWA; string AsmOperands9 = P.AsmSDWA9; let Size = 8; @@ -794,8 +792,6 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA, AMDGPUAsmVariants.Disable); let DecoderNamespace = "GFX8"; - - VOPProfile Pfl = P; } class VOP_SDWA8_Real <VOP_SDWA_Pseudo ps> : @@ -889,7 +885,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 { bits<4> row_mask; bit fi; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{48-40} = dpp_ctrl; let Inst{50} = !if(IsDPP16, fi, ?); let Inst{51} = bound_ctrl; @@ -954,8 +950,8 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> bits<9> src2; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); - let 
Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); } class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> { @@ -964,8 +960,8 @@ class VOP3_DPPe_Common_t16<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op bits<11> src2; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?); } class VOP3P_DPPe_Common_Base<bits<8> op, VOPProfile P> : Enc96 { @@ -998,8 +994,8 @@ class VOP3P_DPPe_Common<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P bits<9> src2; let Inst{7-0} = vdst; - let Inst{49-41} = !if(P.HasSrc1, src1, 0); - let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, ?); + let Inst{58-50} = !if(P.HasSrc2, src2, ?); } class VOP3P_DPPe_Common_t16<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> { @@ -1008,8 +1004,8 @@ class VOP3P_DPPe_Common_t16<bits<8> op, VOPProfile P> : VOP3P_DPPe_Common_Base<o bits<11> src2; let Inst{7-0} = vdst{7-0}; - let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); - let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, ?); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, ?); } class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], @@ -1134,7 +1130,7 @@ class VOP3_DPP_Enc <bits<10> op, VOPProfile P, bit IsDPP16> : VOP3_DPPe_Fields { let Inst{40-32} = 0xfa; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{80-72} = dpp_ctrl; let Inst{82} = !if(IsDPP16, fi, ?); let Inst{83} = bound_ctrl; @@ -1154,7 +1150,7 @@ class VOP3_DPP_Enc_t16<bits<10> op, VOPProfile P, bit IsDPP16 > VOP3_DPPe_Fields_t16 { let Inst{40-32} = 0xfa; - let Inst{71-64} = 
!if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{80-72} = dpp_ctrl; let Inst{82} = !if(IsDPP16, fi, ?); let Inst{83} = bound_ctrl; @@ -1180,7 +1176,7 @@ class VOP3P_DPP <bits<8> op, string OpName, VOPProfile P, bit IsDPP16, let VOP3P = 1; let Inst{40-32} = 0xfa; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{80-72} = dpp_ctrl; let Inst{82} = !if(IsDPP16, fi, ?); let Inst{83} = bound_ctrl; @@ -1195,7 +1191,7 @@ class VOP_DPP8e<VOPProfile P> : Enc64 { bits<24> dpp8; bits<9> fi; - let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{63-40} = dpp8{23-0}; } @@ -1246,7 +1242,7 @@ class VOP3_DPP8_Enc <bits<10> op, VOPProfile P> : VOP3_DPPe_Common<op, P>, VOP3_DPP8e_Fields { let Inst{40-32} = fi; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{95-72} = dpp8{23-0}; } @@ -1257,7 +1253,7 @@ class VOP3_DPP8_Enc_t16 <bits<10> op, VOPProfile P> : VOP3_DPPe_Common_t16<op, P>, VOP3_DPP8e_Fields_t16 { let Inst{40-32} = fi; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{95-72} = dpp8{23-0}; } @@ -1270,7 +1266,7 @@ class VOP3P_DPP8<bits<8> op, string OpName, VOPProfile P> : let VOP3P = 1; let Inst{40-32} = fi; - let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, ?); let Inst{95-72} = dpp8{23-0}; } @@ -1357,8 +1353,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> : class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), 
+ !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), @@ -1873,6 +1873,12 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName, } } +multiclass VOP3_Real_with_name_gfx11_gfx12_gfx13< + bits<10> op, string opName, string asmName, string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Real_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, + VOP3_Real_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, + VOP3_Real_with_name<GFX13Gen, op, opName, asmName, pseudo_mnemonic, isSingle>; + // for READLANE/WRITELANE multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> { defvar ps = !cast<VOP_Pseudo>(opName); @@ -2204,12 +2210,12 @@ include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; class ClassPat_t16<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (i1 (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask))), (inst i32:$src0_mods, vt:$src0, SRCMODS.NONE, (V_MOV_B32_e32 timm:$mask)) >; @@ -2274,3 +2280,12 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; } + +def DPMACCInstructionTable : GenericTable { + let FilterClass = "VOP_Pseudo"; + let CppTypeName = "DPMACCInstructionInfo"; + let Fields = ["Opcode", "IsDPMACCInstruction"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getDPMACCInstructionHelper"; +} |
