27 files changed, 449 insertions, 69 deletions
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 5c3e26e..4cd51d6 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1114,7 +1114,6 @@ bool AArch64InstPrinter::printSyslAlias(const MCInst *MI,
   } else
     return false;
 
-  std::string Str;
   llvm::transform(Name, Name.begin(), ::tolower);
 
   O << '\t' << Ins << '\t' << Reg.str() << ", " << Name;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1c8383c..54d94b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1466,6 +1466,13 @@ def FeatureClusters : SubtargetFeature< "clusters",
   "Has clusters of workgroups support"
 >;
 
+def FeatureWaitsBeforeSystemScopeStores : SubtargetFeature<
+  "waits-before-system-scope-stores",
+  "RequiresWaitsBeforeSystemScopeStores",
+  "true",
+  "Target requires waits for loads and atomics before system scope stores"
+>;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -2060,7 +2067,8 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureMaxHardClauseLength32,
    Feature1_5xVGPRs,
    FeatureMemoryAtomicFAddF32DenormalSupport,
-   FeatureBVHDualAndBVH8Insts
+   FeatureBVHDualAndBVH8Insts,
+   FeatureWaitsBeforeSystemScopeStores,
    ]>;
 
 def FeatureISAVersion12_50 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ac660d5..f377b8a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -290,6 +290,7 @@ protected:
   bool Has45BitNumRecordsBufferResource = false;
 
   bool HasClusters = false;
+  bool RequiresWaitsBeforeSystemScopeStores = false;
 
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable = false;
@@ -1861,6 +1862,10 @@ public:
   bool has45BitNumRecordsBufferResource() const {
     return Has45BitNumRecordsBufferResource;
   }
+
+  bool requiresWaitsBeforeSystemScopeStores() const {
+    return RequiresWaitsBeforeSystemScopeStores;
+  }
 };
 
 class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index a177a42..6ab8d552 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2673,7 +2673,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
   const unsigned Scope = CPol->getImm() & CPol::SCOPE;
 
   // GFX12.0 only: Extra waits needed before system scope stores.
-  if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+  if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
+      Scope == CPol::SCOPE_SYS)
     Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
 
   return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ebd2e7e..d80a6f3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1874,9 +1874,13 @@ void SIRegisterInfo::buildSpillLoadStore(
     }
 
     bool IsSrcDstDef = SrcDstRegState & RegState::Define;
+    bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
     if (NeedSuperRegImpOperand &&
-        (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef)))
+        (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+      if (PartialReloadCopy)
+        MIB.addReg(ValueReg, RegState::Implicit);
+    }
 
     // The epilog restore of a wwm-scratch register can cause undesired
     // optimization during machine-cp post PrologEpilogInserter if the same
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b6..1931e0b 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -838,9 +838,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo <
   let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
 }
 
-let Defs = [SCC] in {
-def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
-} // End Defs = [SCC]
+let isCommutable = 1, Defs = [SCC] in
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32",
+  [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))]
+>;
 
 let SubtargetPredicate = isGFX8GFX9 in {
   def S_RFE_RESTORE_B64 : SOP2_Pseudo <
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
index 7453727..b60569e 100644
--- a/llvm/lib/Target/ARM/ARMProcessors.td
+++ b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -421,6 +421,17 @@ def : ProcessorModel<"cortex-m52", CortexM55Model,      [ARMv81mMainline,
                                                          FeatureMVEVectorCostFactor1,
                                                          HasMVEFloatOps]>;
 
+def : ProcessorModel<"star-mc3", CortexM55Model,      [ARMv81mMainline,
+                                                         FeatureDSP,
+                                                         FeatureFPARMv8_D16,
+                                                         FeatureHasNoBranchPredictor,
+                                                         FeaturePACBTI,
+                                                         FeatureUseMISched,
+                                                         FeaturePreferBranchAlign32,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureMVEVectorCostFactor1,
+                                                         HasMVEFloatOps]>;
+
 def : ProcNoItin<"cortex-a32",                           [ARMv8a,
                                                          FeatureHWDivThumb,
                                                          FeatureHWDivARM,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ca4a655..80c96c6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1701,6 +1701,43 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                      DAG.getConstant(Imm, DL, GRLenVT));
 }
 
+/// Lower VECTOR_SHUFFLE whose result is the reversed source vector.
+///
+/// It is possible to do optimization for VECTOR_SHUFFLE performing vector
+/// reverse whose mask likes:
+///   <7, 6, 5, 4, 3, 2, 1, 0>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue
+lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                              SDValue V1, SelectionDAG &DAG,
+                              const LoongArchSubtarget &Subtarget) {
+  // Only vectors with i8/i16 elements which cannot match other patterns
+  // directly needs to do this.
+  if (VT != MVT::v16i8 && VT != MVT::v8i16 && VT != MVT::v32i8 &&
+      VT != MVT::v16i16)
+    return SDValue();
+
+  if (!ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+    return SDValue();
+
+  int WidenNumElts = VT.getVectorNumElements() / 4;
+  SmallVector<int, 16> WidenMask(WidenNumElts, -1);
+  for (int i = 0; i < WidenNumElts; ++i)
+    WidenMask[i] = WidenNumElts - 1 - i;
+
+  MVT WidenVT = MVT::getVectorVT(
+      VT.getVectorElementType() == MVT::i8 ? MVT::i32 : MVT::i64, WidenNumElts);
+  SDValue NewV1 = DAG.getBitcast(WidenVT, V1);
+  SDValue WidenRev = DAG.getVectorShuffle(WidenVT, DL, NewV1,
+                                          DAG.getUNDEF(WidenVT), WidenMask);
+
+  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT,
+                     DAG.getBitcast(VT, WidenRev),
+                     DAG.getConstant(27, DL, Subtarget.getGRLenVT()));
+}
+
 /// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
 ///
 /// VPACKEV interleaves the even elements from each vector.
@@ -2004,6 +2041,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     if ((Result =
              lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget)))
       return Result;
+    if ((Result =
+             lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
+      return Result;
 
     // TODO: This comment may be enabled in the future to better match the
     // pattern for instruction selection.
@@ -2622,6 +2662,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
       return Result;
     if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget)))
       return Result;
+    if ((Result =
+             lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
+      return Result;
 
     // TODO: This comment may be enabled in the future to better match the
     // pattern for instruction selection.
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 598735f..c923f0e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1082,6 +1082,161 @@ let Predicates = [hasPTX<70>, hasSM<80>] in {
             "mbarrier.pending_count.b64",
             [(set i32:$res, (int_nvvm_mbarrier_pending_count i64:$state))]>;
 }
+
+class MBAR_UTIL<string op, string scope,
+                string space = "", string sem = "",
+                bit tl = 0, bit parity = 0> {
+  // The mbarrier instructions in PTX ISA are of the general form:
+  // mbarrier.op.semantics.scope.space.b64 arg1, arg2 ...
+  // where:
+  // op -> arrive, expect_tx, complete_tx, arrive.expect_tx etc.
+  // semantics -> acquire, release, relaxed (default depends on the op)
+  // scope -> cta or cluster (default is cta-scope)
+  // space -> shared::cta or shared::cluster (default is shared::cta)
+  //
+  // The 'semantics' and 'scope' go together. If one is specified,
+  // then the other _must_ be specified. For example:
+  // (A) mbarrier.arrive             <args> (valid, release and cta are default)
+  // (B) mbarrier.arrive.release.cta <args> (valid, sem/scope mentioned explicitly)
+  // (C) mbarrier.arrive.release     <args> (invalid, needs scope)
+  // (D) mbarrier.arrive.cta         <args> (invalid, needs order)
+  //
+  // Wherever possible, we prefer form (A) to (B) since it is available
+  // from early PTX versions. In most cases, explicitly specifying the
+  // scope requires a later version of PTX.
+  string _scope_asm = !cond(
+                      !eq(scope, "scope_cluster") : "cluster",
+                      !eq(scope, "scope_cta") : !if(!empty(sem), "", "cta"),
+                      true : scope);
+  string _space_asm = !cond(
+                      !eq(space, "space_cta") : "shared",
+                      !eq(space, "space_cluster") : "shared::cluster",
+                      true : space);
+
+  string _parity = !if(parity, "parity", "");
+  string asm_str = StrJoin<".", ["mbarrier", op, _parity,
+                                  sem, _scope_asm, _space_asm, "b64"]>.ret;
+
+  string _intr_suffix = StrJoin<"_", [!subst(".", "_", op), _parity,
+                                      !if(tl, "tl", ""),
+                                      sem, scope, space]>.ret;
+  string intr_name = "int_nvvm_mbarrier_" # _intr_suffix;
+
+  // Predicate checks:
+  // These are used only for the "test_wait/try_wait" variants as they
+  // have evolved since sm80 and are complex. The predicates for the
+  // remaining instructions are straightforward and have already been
+  // applied directly.
+  Predicate _sm_pred = !cond(!or(
+                       !eq(op, "try_wait"),
+                       !eq(scope, "scope_cluster"),
+                       !eq(sem, "relaxed")) : hasSM<90>,
+                       true : hasSM<80>);
+  Predicate _ptx_pred = !cond(
+                        !eq(sem, "relaxed") : hasPTX<86>,
+                        !ne(_scope_asm, "") : hasPTX<80>,
+                        !eq(op, "try_wait") : hasPTX<78>,
+                        parity : hasPTX<71>,
+                        true   : hasPTX<70>);
+  list<Predicate> preds = [_ptx_pred, _sm_pred];
+}
+
+foreach op = ["expect_tx", "complete_tx"] in {
+  foreach scope = ["scope_cta", "scope_cluster"] in {
+    foreach space = ["space_cta", "space_cluster"] in {
+      defvar intr = !cast<Intrinsic>(MBAR_UTIL<op, scope, space>.intr_name);
+      defvar suffix = StrJoin<"_", [op, scope, space]>.ret;
+      def mbar_ # suffix : BasicNVPTXInst<(outs), (ins ADDR:$addr, B32:$tx_count),
+                           MBAR_UTIL<op, scope, space, "relaxed">.asm_str,
+                           [(intr addr:$addr, i32:$tx_count)]>,
+                           Requires<[hasPTX<80>, hasSM<90>]>;
+    } // space
+  } // scope
+} // op
+
+multiclass MBAR_ARR_INTR<string op, string scope, string sem,
+                         list<Predicate> pred = []> {
+  // When either of sem or scope is non-default, both have to
+  // be explicitly specified. So, explicitly state that
+  // sem is `release` when scope is `cluster`.
+  defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")),
+                    "release", sem);
+
+  defvar asm_cta  = MBAR_UTIL<op, scope, "space_cta", asm_sem>.asm_str;
+  defvar intr_cta = !cast<Intrinsic>(MBAR_UTIL<op, scope,
+                                     "space_cta", sem>.intr_name);
+
+  defvar asm_cluster  = MBAR_UTIL<op, scope, "space_cluster", asm_sem>.asm_str;
+  defvar intr_cluster = !cast<Intrinsic>(MBAR_UTIL<op, scope,
+                                         "space_cluster", sem>.intr_name);
+
+  def _CTA : NVPTXInst<(outs B64:$state),
+             (ins ADDR:$addr, B32:$tx_count),
+             asm_cta # " $state, [$addr], $tx_count;",
+             [(set i64:$state, (intr_cta addr:$addr, i32:$tx_count))]>,
+             Requires<pred>;
+  def _CLUSTER : NVPTXInst<(outs),
+                 (ins ADDR:$addr, B32:$tx_count),
+                 asm_cluster # " _, [$addr], $tx_count;",
+                 [(intr_cluster addr:$addr, i32:$tx_count)]>,
+                 Requires<pred>;
+}
+foreach op = ["arrive", "arrive.expect_tx",
+              "arrive_drop", "arrive_drop.expect_tx"] in {
+  foreach scope = ["scope_cta", "scope_cluster"] in {
+    defvar suffix = !subst(".", "_", op) # scope;
+    defm mbar_ # suffix # _release : MBAR_ARR_INTR<op, scope, "", [hasPTX<80>, hasSM<90>]>;
+    defm mbar_ # suffix # _relaxed : MBAR_ARR_INTR<op, scope, "relaxed", [hasPTX<86>, hasSM<90>]>;
+  } // scope
+} // op
+
+multiclass MBAR_WAIT_INTR<string op, string scope, string sem, bit time_limit> {
+  // When either of sem or scope is non-default, both have to
+  // be explicitly specified. So, explicitly state that the
+  // semantics is `acquire` when the scope is `cluster`.
+  defvar asm_sem = !if(!and(!empty(sem), !eq(scope, "scope_cluster")),
+                    "acquire", sem);
+
+  defvar asm_parity  = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                 time_limit, 1>.asm_str;
+  defvar pred_parity = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                 time_limit, 1>.preds;
+  defvar intr_parity = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta",
+                                        sem, time_limit, 1>.intr_name);
+
+  defvar asm_state  = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                time_limit>.asm_str;
+  defvar pred_state = MBAR_UTIL<op, scope, "space_cta", asm_sem,
+                                time_limit>.preds;
+  defvar intr_state = !cast<Intrinsic>(MBAR_UTIL<op, scope, "space_cta",
+                                       sem, time_limit>.intr_name);
+
+  defvar ins_tl_dag = !if(time_limit, (ins B32:$tl), (ins));
+  defvar tl_suffix = !if(time_limit, ", $tl;", ";");
+  defvar intr_state_dag = !con((intr_state addr:$addr, i64:$state),
+                               !if(time_limit, (intr_state i32:$tl), (intr_state)));
+  defvar intr_parity_dag = !con((intr_parity addr:$addr, i32:$phase),
+                               !if(time_limit, (intr_parity i32:$tl), (intr_parity)));
+
+  def _STATE : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B64:$state), ins_tl_dag),
+               asm_state # " $res, [$addr], $state" # tl_suffix,
+               [(set i1:$res, intr_state_dag)]>,
+               Requires<pred_state>;
+  def _PARITY : NVPTXInst<(outs B1:$res), !con((ins ADDR:$addr, B32:$phase), ins_tl_dag),
+                asm_parity # " $res, [$addr], $phase" # tl_suffix,
+                [(set i1:$res, intr_parity_dag)]>,
+                Requires<pred_parity>;
+}
+foreach op = ["test_wait", "try_wait"] in {
+  foreach scope = ["scope_cta", "scope_cluster"] in {
+    foreach time_limit = !if(!eq(op, "try_wait"), [true, false], [false]) in {
+      defvar suffix = StrJoin<"_", [op, scope, !if(time_limit, "tl", "")]>.ret;
+      defm mbar_ # suffix # "_acquire" : MBAR_WAIT_INTR<op, scope, "", time_limit>;
+      defm mbar_ # suffix # "_relaxed" : MBAR_WAIT_INTR<op, scope, "relaxed", time_limit>;
+    } // time_limit
+  } // scope
+} // op
+
 //-----------------------------------
 // Math Functions
 //-----------------------------------
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
index 92af04a..4695a6f 100644
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -825,8 +825,7 @@ def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read],
 def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read],
       (instrs
     SRADI_rec,
-    SRAWI_rec,
-    SRAWI8_rec
+    SRAWI8_rec, SRAWI_rec
 )>;
 
 // Single crack instructions
@@ -834,8 +833,7 @@ def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read],
 def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read, P10F2_Read],
       (instrs
     SRAD_rec,
-    SRAW_rec,
-    SRAW8_rec
+    SRAW8_rec, SRAW_rec
 )>;
 
 // 2-way crack instructions
@@ -883,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY],
 // 3 Cycles ALU operations, 1 input operands
 def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
       (instrs
-    ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, ADDItocL8, LI, LI8,
+    ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, LI, LI8,
     ADDIC, ADDIC8,
     ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8,
     ADDME, ADDME8,
@@ -1864,7 +1862,7 @@ def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read, P10ST_Read]
       (instrs
     CP_PASTE8_rec, CP_PASTE_rec,
     SLBIEG,
-    TLBIE
+    TLBIE, TLBIE8P9, TLBIEP9
 )>;
 
 // Single crack instructions
@@ -1886,8 +1884,7 @@ def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_DISP_ANY, P10ST_Read, P10ST_Read,
 def : InstRW<[P10W_ST_3C, P10W_DISP_EVEN, P10W_FX_3C, P10W_DISP_ANY],
       (instrs
     ISYNC,
-    SYNCP10,
-    SYNC
+    SYNC, SYNCP10
 )>;
 
 // Expand instructions
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td
index 801ae83..3f5f7d3 100644
--- a/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -905,7 +905,7 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C],
     SLBIEG,
     STMW,
     STSWI,
-    TLBIE
+    TLBIE, TLBIEP9, TLBIE8P9
 )>;
 
 // Vector Store Instruction
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 4ff2f8a..5d9ec4a 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -409,6 +409,7 @@ def HasP10Vector : Predicate<"Subtarget->hasP10Vector()">;
 def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">;
 def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">;
 def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">;
+def IsNotISA3_0 : Predicate<"!Subtarget->isISA3_0()">;
 def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">;
 def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">;
 def IsISAFuture : Predicate<"Subtarget->isISAFuture()">;
diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
index 6bb66bc..043c9e4 100644
--- a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
@@ -29,7 +29,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     ADDIStocHA8,
     ADDIdtprelL32,
     ADDItlsldLADDR32,
-    ADDItocL8,
+    ADDItocL,
     ADDME,
     ADDME8,
     ADDME8O,
@@ -209,7 +209,9 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     SRADI,
     SRADI_32,
     SRAW,
+    SRAW8,
     SRAWI,
+    SRAWI8,
     SRD,
     SRD_rec,
     SRW,
@@ -518,7 +520,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     ADDIStocHA8,
     ADDIdtprelL32,
     ADDItlsldLADDR32,
-    ADDItocL8,
+    ADDItocL,
     ADDME,
     ADDME8,
     ADDME8O,
@@ -747,7 +749,9 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     SRADI,
     SRADI_32,
     SRAW,
+    SRAW8,
     SRAWI,
+    SRAWI8,
     SRD,
     SRD_rec,
     SRW,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index fba1c66..1a77b00 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -850,6 +850,36 @@ class XForm_45<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = 0;
 }
 
+class XForm_RSB5_UIMM2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                              string asmstr, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+
+  bits<5> RS;
+  bits<5> RB;
+  bits<2> RIC;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = RS;
+  let Inst{11} = 0;
+  let Inst{12...13} = RIC;
+  let Inst{14...15} = 0;
+  let Inst{16...20} = RB;
+  let Inst{21...30} = xo;
+  let Inst{31} = 0;
+}
+
+class XForm_RSB5_UIMM2_2UIMM1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                              string asmstr, list<dag> pattern>
+    : XForm_RSB5_UIMM2<opcode, xo, OOL, IOL, asmstr, pattern> {
+
+  bits<1> PRS;
+  bits<1> R;
+
+  let Inst{14} = PRS;
+  let Inst{15} = R;
+}
+
 class X_FRT5_XO2_XO3_XO10<bits<6> opcode, bits<2> xo1, bits<3> xo2, bits<10> xo,
                          dag OOL, dag IOL, string asmstr, InstrItinClass itin,
                          list<dag> pattern>
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 1aefea1..b0bed71c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -11,6 +11,18 @@
 //
 //===----------------------------------------------------------------------===//
 
+class XForm_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                list<dag> pattern> : I<opcode, OOL, IOL, asmstr, NoItinerary> {
+  bits<5> RS;
+
+  let Pattern = pattern;
+
+  let Inst{6...10} = RS;
+  let Inst{11...20} = 0;
+  let Inst{21...30} = xo;
+  let Inst{31} = 0;
+}
+
 class XOForm_RTAB5_L1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
                       string asmstr, list<dag> pattern>
     : I<opcode, OOL, IOL, asmstr, NoItinerary> {
@@ -294,6 +306,24 @@ let Predicates = [IsISAFuture] in {
   defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT),
                                  (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus",
                                  "$RT, $L, $RA, $RB", []>;
+  def TLBSYNCIO
+      : XForm_RS5<31, 564, (outs), (ins g8rc:$RS), "tlbsyncio $RS", []>;
+  def PTESYNCIO
+      : XForm_RS5<31, 596, (outs), (ins g8rc:$RS), "ptesyncio $RS", []>;
+  def TLBIEP : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs),
+                                       (ins gprc:$RB, gprc:$RS, u2imm:$RIC,
+                                           u1imm:$PRS, u1imm:$R),
+                                       "tlbiep $RB, $RS, $RIC, $PRS, $R", []>;
+  def TLBIEIO
+      : XForm_RSB5_UIMM2<31, 18, (outs), (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC),
+                         "tlbieio $RB, $RS, $RIC", []>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+    def TLBIEP8
+        : XForm_RSB5_UIMM2_2UIMM1<31, 50, (outs),
+                                  (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC,
+                                      u1imm:$PRS, u1imm:$R),
+                                  "tlbiep $RB, $RS, $RIC, $PRS, $R", []>;
+  }
 }
 
 let Predicates = [HasVSX, IsISAFuture] in {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 44d1a44..f399811 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -4321,7 +4321,22 @@ def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB),
                           "tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
 
 def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RST, gprc:$RB),
-                          "tlbie $RB,$RST", IIC_SprTLBIE, []>;
+                     "tlbie $RB, $RST", IIC_SprTLBIE, []>,
+            Requires<[IsNotISA3_0]>;
+
+let Predicates = [IsISA3_0] in {
+  def TLBIEP9 : XForm_RSB5_UIMM2_2UIMM1<31, 306, (outs),
+                                        (ins gprc:$RB, gprc:$RS, u2imm:$RIC,
+                                            u1imm:$PRS, u1imm:$R),
+                                        "tlbie $RB, $RS, $RIC, $PRS, $R", []>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+    def TLBIE8P9
+        : XForm_RSB5_UIMM2_2UIMM1<31, 306, (outs),
+                                  (ins g8rc:$RB, g8rc:$RS, u2imm:$RIC,
+                                      u1imm:$PRS, u1imm:$R),
+                                  "tlbie $RB, $RS, $RIC, $PRS, $R", []>;
+  }
+}
 
 def TLBSX : XForm_tlb<914, (outs), (ins gprc:$RA, gprc:$RB), "tlbsx $RA, $RB",
                 IIC_LdStLoad>, Requires<[IsBookE]>;
@@ -4669,7 +4684,11 @@ def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
 
 }
 
-def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>,  Requires<[IsNotISA3_0]>;
+let Predicates = [IsISA3_0] in {
+  def : InstAlias<"tlbie $RB", (TLBIEP9 R0, gprc:$RB, 0, 0, 0)>;
+  def : InstAlias<"tlbie $RB, $RS", (TLBIEP9 gprc:$RB, gprc:$RS, 0, 0, 0)>;
+}
 
 def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>,
                 Requires<[IsPPC4xx]>;
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index d76180c..ea41716 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -245,7 +245,10 @@ struct ExtendedBuiltin {
 
 enum InstFlags {
   // It is a half type
-  INST_PRINTER_WIDTH16 = 1
+  INST_PRINTER_WIDTH16 = 1,
+  // It is a 64-bit type
+  INST_PRINTER_WIDTH64 = INST_PRINTER_WIDTH16 << 1,
+
 };
 } // namespace SPIRV
 
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 35a2ee1..62f5e47 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -167,6 +167,36 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
               MI, FirstVariableIndex, OS);
           printRemainingVariableOps(MI, FirstVariableIndex + 1, OS);
           break;
+        case SPIRV::OpSwitch:
+          if (MI->getFlags() & SPIRV::INST_PRINTER_WIDTH64) {
+            // In binary format 64-bit types are split into two 32-bit operands,
+            // but in text format combine these into a single 64-bit value as
+            // this is what tools such as spirv-as require.
+            const unsigned NumOps = MI->getNumOperands();
+            for (unsigned OpIdx = NumFixedOps; OpIdx < NumOps;) {
+              if (OpIdx + 1 >= NumOps || !MI->getOperand(OpIdx).isImm() ||
+                  !MI->getOperand(OpIdx + 1).isImm()) {
+                llvm_unreachable("Unexpected OpSwitch operands");
+                continue;
+              }
+              OS << ' ';
+              uint64_t LowBits = MI->getOperand(OpIdx).getImm();
+              uint64_t HighBits = MI->getOperand(OpIdx + 1).getImm();
+              uint64_t CombinedValue = (HighBits << 32) | LowBits;
+              OS << formatImm(CombinedValue);
+              OpIdx += 2;
+
+              // Next should be the label
+              if (OpIdx < NumOps) {
+                OS << ' ';
+                printOperand(MI, OpIdx, OS);
+                OpIdx++;
+              }
+            }
+          } else {
+            printRemainingVariableOps(MI, NumFixedOps, OS);
+          }
+          break;
         case SPIRV::OpImageSampleImplicitLod:
         case SPIRV::OpImageSampleDrefImplicitLod:
         case SPIRV::OpImageSampleProjImplicitLod:
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 4de9d6a..4c5b81f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -62,7 +62,9 @@ public:
 namespace SPIRV {
 enum AsmComments {
   // It is a half type
-  ASM_PRINTER_WIDTH16 = MachineInstr::TAsmComments
+  ASM_PRINTER_WIDTH16 = MachineInstr::TAsmComments,
+  // It is a 64 bit type
+  ASM_PRINTER_WIDTH64 = ASM_PRINTER_WIDTH16 << 1,
 };
 } // namespace SPIRV
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
index e39666c..9aa07b5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
@@ -25,6 +25,8 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI,
   // Propagate previously set flags
   if (MI->getAsmPrinterFlags() & SPIRV::ASM_PRINTER_WIDTH16)
     OutMI.setFlags(SPIRV::INST_PRINTER_WIDTH16);
+  if (MI->getAsmPrinterFlags() & SPIRV::ASM_PRINTER_WIDTH64)
+    OutMI.setFlags(SPIRV::INST_PRINTER_WIDTH64);
   const MachineFunction *MF = MI->getMF();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 4e2cc88..8f2fc01 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -105,6 +105,8 @@ void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) {
     uint32_t LowBits = FullImm & 0xffffffff;
     uint32_t HighBits = (FullImm >> 32) & 0xffffffff;
     MIB.addImm(LowBits).addImm(HighBits);
+    // Asm Printer needs this info to print 64-bit operands correctly
+    MIB.getInstr()->setAsmPrinterFlag(SPIRV::ASM_PRINTER_WIDTH64);
     return;
   }
   report_fatal_error("Unsupported constant bitwidth");
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index de28faf..3da720f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1714,7 +1714,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
     }
     if (Constraint[1] == '@') {
       if (StringRef("{@cc}").compare(Constraint) == 0)
-        return std::make_pair(0u, &SystemZ::GR32BitRegClass);
+        return std::make_pair(SystemZ::CC, &SystemZ::CCRRegClass);
     }
   }
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
@@ -1766,10 +1766,6 @@ SDValue SystemZTargetLowering::LowerAsmOutputForConstraint(
       OpInfo.ConstraintVT.getSizeInBits() < 8)
     report_fatal_error("Glue output operand is of invalid type");
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  MRI.addLiveIn(SystemZ::CC);
-
   if (Glue.getNode()) {
     Glue = DAG.getCopyFromReg(Chain, DL, SystemZ::CC, MVT::i32, Glue);
     Chain = Glue.getValue(1);
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 706ab2b..51b540a 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -14,7 +14,10 @@
 #ifndef LLVM_LIB_TARGET_X86_X86_H
 #define LLVM_LIB_TARGET_X86_X86_H
 
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
 
@@ -162,7 +165,17 @@ FunctionPass *createX86WinEHUnwindV2Pass();
 
 /// The pass transforms load/store <256 x i32> to AMX load/store intrinsics
 /// or split the data to two <128 x i32>.
-FunctionPass *createX86LowerAMXTypePass();
+class X86LowerAMXTypePass : public PassInfoMixin<X86LowerAMXTypePass> {
+private:
+  const TargetMachine *TM;
+
+public:
+  X86LowerAMXTypePass(const TargetMachine *TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  static bool isRequired() { return true; }
+};
+
+FunctionPass *createX86LowerAMXTypeLegacyPass();
 
 /// The pass transforms amx intrinsics to scalar operation if the function has
 /// optnone attribute or it is O0.
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index d979517..2c0443d 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -10,6 +10,7 @@
 /// TODO: Port CodeGen passes to new pass manager.
 //===----------------------------------------------------------------------===//
 
+#include "X86.h"
 #include "X86ISelDAGToDAG.h"
 #include "X86TargetMachine.h"
 
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 0ba71ad..8ffd454 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -46,12 +46,14 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -64,7 +66,7 @@
 using namespace llvm;
 using namespace PatternMatch;
 
-#define DEBUG_TYPE "lower-amx-type"
+#define DEBUG_TYPE "x86-lower-amx-type"
 
 static bool isAMXCast(Instruction *II) {
   return match(II,
@@ -137,7 +139,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
 
 class ShapeCalculator {
 private:
-  TargetMachine *TM = nullptr;
+  const TargetMachine *TM = nullptr;
 
   // In AMX intrinsics we let Shape = {Row, Col}, but the
   // RealCol = Col / ElementSize. We may use the RealCol
@@ -145,7 +147,7 @@ private:
   std::map<Value *, Value *> Col2Row, Row2Col;
 
 public:
-  ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {}
+  ShapeCalculator(const TargetMachine *TargetM) : TM(TargetM) {}
   std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
   std::pair<Value *, Value *> getShape(PHINode *Phi);
   Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
@@ -1432,8 +1434,58 @@ bool X86LowerAMXCast::transformAllAMXCast() {
   return Change;
 }
 
+bool lowerAmxType(Function &F, const TargetMachine *TM,
+                  TargetLibraryInfo *TLI) {
+  // Performance optimization: most code doesn't use AMX, so return early if
+  // there are no instructions that produce AMX values. This is sufficient, as
+  // AMX arguments and constants are not allowed -- so any producer of an AMX
+  // value must be an instruction.
+  // TODO: find a cheaper way for this, without looking at all instructions.
+  if (!containsAMXCode(F))
+    return false;
+
+  bool C = false;
+  ShapeCalculator SC(TM);
+  X86LowerAMXCast LAC(F, &SC);
+  C |= LAC.combineAMXcast(TLI);
+  // There might be remaining AMXcast after combineAMXcast and they should be
+  // handled elegantly.
+  C |= LAC.transformAllAMXCast();
+
+  X86LowerAMXType LAT(F, &SC);
+  C |= LAT.visit();
+
+  // Prepare for fast register allocation at O0.
+  // Todo: May better check the volatile model of AMX code, not just
+  // by checking Attribute::OptimizeNone and CodeGenOptLevel::None.
+  if (TM->getOptLevel() == CodeGenOptLevel::None) {
+    // If Front End not use O0 but the Mid/Back end use O0, (e.g.
+    // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make
+    // sure the amx data is volatile, that is necessary for AMX fast
+    // register allocation.
+    if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
+      X86VolatileTileData VTD(F);
+      C = VTD.volatileTileData() || C;
+    }
+  }
+
+  return C;
+}
+
 } // anonymous namespace
 
+PreservedAnalyses X86LowerAMXTypePass::run(Function &F,
+                                           FunctionAnalysisManager &FAM) {
+  TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+  bool Changed = lowerAmxType(F, TM, &TLI);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
 namespace {
 
 class X86LowerAMXTypeLegacyPass : public FunctionPass {
@@ -1443,44 +1495,10 @@ public:
   X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {}
 
   bool runOnFunction(Function &F) override {
-    // Performance optimization: most code doesn't use AMX, so return early if
-    // there are no instructions that produce AMX values. This is sufficient, as
-    // AMX arguments and constants are not allowed -- so any producer of an AMX
-    // value must be an instruction.
-    // TODO: find a cheaper way for this, without looking at all instructions.
-    if (!containsAMXCode(F))
-      return false;
-
-    bool C = false;
     TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
     TargetLibraryInfo *TLI =
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
-    ShapeCalculator SC(TM);
-    X86LowerAMXCast LAC(F, &SC);
-    C |= LAC.combineAMXcast(TLI);
-    // There might be remaining AMXcast after combineAMXcast and they should be
-    // handled elegantly.
-    C |= LAC.transformAllAMXCast();
-
-    X86LowerAMXType LAT(F, &SC);
-    C |= LAT.visit();
-
-    // Prepare for fast register allocation at O0.
-    // Todo: May better check the volatile model of AMX code, not just
-    // by checking Attribute::OptimizeNone and CodeGenOptLevel::None.
-    if (TM->getOptLevel() == CodeGenOptLevel::None) {
-      // If Front End not use O0 but the Mid/Back end use O0, (e.g.
-      // "Clang -O2 -S -emit-llvm t.c" + "llc t.ll") we should make
-      // sure the amx data is volatile, that is nessary for AMX fast
-      // register allocation.
-      if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
-        X86VolatileTileData VTD(F);
-        C = VTD.volatileTileData() || C;
-      }
-    }
-
-    return C;
+    return lowerAmxType(F, TM, TLI);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1501,6 +1519,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
                     false)
 
-FunctionPass *llvm::createX86LowerAMXTypePass() {
+FunctionPass *llvm::createX86LowerAMXTypeLegacyPass() {
   return new X86LowerAMXTypeLegacyPass();
 }
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index 3f2a433..fc25d55 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -12,11 +12,16 @@
 
 // NOTE: NO INCLUDE GUARD DESIRED!
 
+#ifndef FUNCTION_PASS
+#define FUNCTION_PASS(NAME, CREATE_PASS)
+#endif
+FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this))
+#undef FUNCTION_PASS
+
 #ifndef DUMMY_FUNCTION_PASS
 #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
-DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this))
 DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
 DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
 #undef DUMMY_FUNCTION_PASS
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d..9a76abc 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -423,7 +423,7 @@ void X86PassConfig::addIRPasses() {
   // We add both pass anyway and when these two passes run, we skip the pass
   // based on the option level and option attribute.
   addPass(createX86LowerAMXIntrinsicsPass());
-  addPass(createX86LowerAMXTypePass());
+  addPass(createX86LowerAMXTypeLegacyPass());
 
   TargetPassConfig::addIRPasses();