82 files changed, 2956 insertions, 3423 deletions
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 7499613..9ecd390 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -241,6 +241,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e
      - Adds predicated load and store instructions that conditionally read from or write to memory based on a boolean predicate.
    * - ``SPV_KHR_maximal_reconvergence``
      - Adds execution mode and capability to enable maximal reconvergence.
+   * - ``SPV_ALTERA_blocking_pipes``
+     - Adds new pipe read and write functions that have blocking semantics instead of the non-blocking semantics of the existing pipe read/write functions.
 
 SPIR-V representation in LLVM IR
 ================================
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 221d8f1..f585257 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1331,8 +1331,8 @@ public:
       bool SplitDst =
           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
           TargetLowering::TypeSplitVector;
-      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
-          DstVTy->getElementCount().isVector()) {
+      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isKnownEven() &&
+          DstVTy->getElementCount().isKnownEven()) {
         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
         const T *TTI = thisT();
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 268025e7..9d6038d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -297,6 +297,10 @@ private:
   /// \pre \p U is a call instruction.
   bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateIntrinsic(
+      const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+      const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+
   /// When an invoke or a cleanupret unwinds to the next EH pad, there are
   /// many places it could ultimately go. In the IR, we have a single unwind
   /// destination, but in the machine CFG, we enumerate all the possible blocks.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8aeaa9c..2550c2b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3233,11 +3233,6 @@ public:
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
 
-  /// Return true if the target interleave with shuffles are cheaper
-  virtual bool isProfitableToInterleaveWithGatherScatter() const {
-    return false;
-  }
-
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
index 7cc78d4..fc41641 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
@@ -211,6 +211,21 @@ public:
     return FilteredView(Libraries.begin(), Libraries.end(), S, K);
   }
 
+  using LibraryFilterFn = std::function<bool(const LibraryInfo &)>;
+  /// Append to \p Outs each library in state \p S and of kind \p K that the
+  /// optional \p Filter accepts; iterates under a shared lock on Mtx.
+  void getLibraries(LibState S, PathType K,
+                    std::vector<std::shared_ptr<LibraryInfo>> &Outs,
+                    LibraryFilterFn Filter = nullptr) const {
+    std::shared_lock<std::shared_mutex> Lock(Mtx);
+    for (const auto &[_, Entry] : Libraries) {
+      const LibraryInfo &Lib = *Entry;
+      const bool Matches = Lib.getKind() == K && Lib.getState() == S;
+      if (Matches && (!Filter || Filter(Lib)))
+        Outs.push_back(Entry);
+    }
+  }
+
   void forEachLibrary(const LibraryVisitor &visitor) const {
     std::unique_lock<std::shared_mutex> Lock(Mtx);
     for (const auto &[_, entry] : Libraries) {
@@ -220,14 +235,14 @@ public:
   }
 
   bool isLoaded(StringRef Path) const {
-    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    std::shared_lock<std::shared_mutex> Lock(Mtx);
     if (auto It = Libraries.find(Path.str()); It != Libraries.end())
       return It->second->getState() == LibState::Loaded;
     return false;
   }
 
   bool isQueried(StringRef Path) const {
-    std::unique_lock<std::shared_mutex> Lock(Mtx);
+    std::shared_lock<std::shared_mutex> Lock(Mtx);
     if (auto It = Libraries.find(Path.str()); It != Libraries.end())
       return It->second->getState() == LibState::Queried;
     return false;
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 9924b90..d7db935 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -176,4 +176,10 @@ def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, l
 
 def int_dx_group_memory_barrier_with_group_sync
     : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>;
+
+def int_dx_load_input
+    : DefaultAttrsIntrinsic<[llvm_any_ty],
+                            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i8_ty,
+                             llvm_i32_ty],
+                            [IntrConvergent]>;
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 4fd2204..be1b51f 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2821,20 +2821,34 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   if (translateKnownIntrinsic(CI, ID, MIRBuilder))
     return true;
 
+  TargetLowering::IntrinsicInfo Info;
+  bool IsTgtMemIntrinsic = TLI->getTgtMemIntrinsic(Info, CI, *MF, ID);
+
+  return translateIntrinsic(CI, ID, MIRBuilder,
+                            IsTgtMemIntrinsic ? &Info : nullptr);
+}
+
+/// Translate a call to an intrinsic.
+/// If TLI->getTgtMemIntrinsic() returned true for this call,
+/// \p TgtMemIntrinsicInfo points to the IntrinsicInfo object it populated.
+/// Otherwise, this pointer is null.
+bool IRTranslator::translateIntrinsic(
+    const CallBase &CB, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder,
+    const TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
   ArrayRef<Register> ResultRegs;
-  if (!CI.getType()->isVoidTy())
-    ResultRegs = getOrCreateVRegs(CI);
+  if (!CB.getType()->isVoidTy())
+    ResultRegs = getOrCreateVRegs(CB);
 
   // Ignore the callsite attributes. Backend code is most likely not expecting
   // an intrinsic to sometimes have side effects and sometimes not.
   MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, ResultRegs);
-  if (isa<FPMathOperator>(CI))
-    MIB->copyIRFlags(CI);
+  if (isa<FPMathOperator>(CB))
+    MIB->copyIRFlags(CB);
 
-  for (const auto &Arg : enumerate(CI.args())) {
+  for (const auto &Arg : enumerate(CB.args())) {
     // If this is required to be an immediate, don't materialize it in a
     // register.
-    if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
+    if (CB.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
       if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) {
         // imm arguments are more convenient than cimm (and realistically
         // probably sufficient), so use them.
@@ -2863,29 +2877,33 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   }
 
   // Add a MachineMemOperand if it is a target mem intrinsic.
-  TargetLowering::IntrinsicInfo Info;
-  // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
-  if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) {
-    Align Alignment = Info.align.value_or(
-        DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
-    LLT MemTy = Info.memVT.isSimple()
-                    ? getLLTForMVT(Info.memVT.getSimpleVT())
-                    : LLT::scalar(Info.memVT.getStoreSizeInBits());
+  if (TgtMemIntrinsicInfo) {
+    const Function *F = CB.getCalledFunction();
+
+    Align Alignment = TgtMemIntrinsicInfo->align.value_or(DL->getABITypeAlign(
+        TgtMemIntrinsicInfo->memVT.getTypeForEVT(F->getContext())));
+    LLT MemTy =
+        TgtMemIntrinsicInfo->memVT.isSimple()
+            ? getLLTForMVT(TgtMemIntrinsicInfo->memVT.getSimpleVT())
+            : LLT::scalar(TgtMemIntrinsicInfo->memVT.getStoreSizeInBits());
 
     // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
     //       didn't yield anything useful.
     MachinePointerInfo MPI;
-    if (Info.ptrVal)
-      MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
-    else if (Info.fallbackAddressSpace)
-      MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+    if (TgtMemIntrinsicInfo->ptrVal) {
+      MPI = MachinePointerInfo(TgtMemIntrinsicInfo->ptrVal,
+                               TgtMemIntrinsicInfo->offset);
+    } else if (TgtMemIntrinsicInfo->fallbackAddressSpace) {
+      MPI = MachinePointerInfo(*TgtMemIntrinsicInfo->fallbackAddressSpace);
+    }
     MIB.addMemOperand(MF->getMachineMemOperand(
-        MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata(),
-        /*Ranges=*/nullptr, Info.ssid, Info.order, Info.failureOrder));
+        MPI, TgtMemIntrinsicInfo->flags, MemTy, Alignment, CB.getAAMetadata(),
+        /*Ranges=*/nullptr, TgtMemIntrinsicInfo->ssid,
+        TgtMemIntrinsicInfo->order, TgtMemIntrinsicInfo->failureOrder));
   }
 
-  if (CI.isConvergent()) {
-    if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+  if (CB.isConvergent()) {
+    if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) {
       auto *Token = Bundle->Inputs[0].get();
       Register TokenReg = getOrCreateVReg(*Token);
       MIB.addUse(TokenReg, RegState::Implicit);
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 45eca28..5c27a20 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,8 +239,7 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
-                               unsigned MaxFactor,
-                               bool InterleaveWithShuffles) {
+                               unsigned MaxFactor) {
   unsigned NumElts = SVI->getShuffleMask().size();
   if (NumElts < 4)
     return false;
@@ -251,13 +250,6 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
       return true;
   }
 
-  if (InterleaveWithShuffles) {
-    for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
-      Factor = i * MaxFactor;
-      if (SVI->isInterleave(Factor))
-        return true;
-    }
-  }
   return false;
 }
 
@@ -536,8 +528,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
       cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
-  if (!isReInterleaveMask(SVI, Factor, MaxFactor,
-                          TLI->isProfitableToInterleaveWithGatherScatter()))
+  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
     return false;
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 46c4bb8..816b7ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4046,6 +4046,8 @@ static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
                                     m_ConstInt(AndMask)))) {
     // Type Legalisation Pattern:
     // (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff)
+    if (BitWidthDiff.getZExtValue() >= BitWidth)
+      return SDValue();
     unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue();
     if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth))
       return SDValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index fa0c899..9961c98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3526,8 +3526,7 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
 
   // Update successor info.
   addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne());
-  for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
-    BasicBlock *Dest = I.getIndirectDest(i);
+  for (BasicBlock *Dest : I.getIndirectDests()) {
     MachineBasicBlock *Target = FuncInfo.getMBB(Dest);
     Target->setIsInlineAsmBrIndirectTarget();
     // If we introduce a type of asm goto statement that is permitted to use an
@@ -5313,18 +5312,26 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
   DAG.setRoot(OutChain);
 }
 
-/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
-/// node.
-void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
-                                               unsigned Intrinsic) {
-  // Ignore the callsite's attributes. A specific call site may be marked with
-  // readnone, but the lowering code will expect the chain based on the
-  // definition.
+/// Return {HasChain, OnlyLoad} for this intrinsic call: whether the callee may
+/// access memory (and so needs a chain) and, if so, whether it only reads
+/// memory, will return and does not throw.
+/// Callsite attributes are ignored: a specific call site may be marked
+/// readnone, but the lowering code expects the chain based on the definition.
+std::pair<bool, bool>
+SelectionDAGBuilder::getTargetIntrinsicCallProperties(const CallBase &I) {
   const Function *F = I.getCalledFunction();
   bool HasChain = !F->doesNotAccessMemory();
   bool OnlyLoad =
       HasChain && F->onlyReadsMemory() && F->willReturn() && F->doesNotThrow();
 
+  return {HasChain, OnlyLoad};
+}
+
+SmallVector<SDValue, 8> SelectionDAGBuilder::getTargetIntrinsicOperands(
+    const CallBase &I, bool HasChain, bool OnlyLoad,
+    TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
   // Build the operand list.
   SmallVector<SDValue, 8> Ops;
   if (HasChain) {  // If this intrinsic has side-effects, chainify it.
@@ -5336,17 +5343,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
     }
   }
 
-  // Info is set by getTgtMemIntrinsic
-  TargetLowering::IntrinsicInfo Info;
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I,
-                                               DAG.getMachineFunction(),
-                                               Intrinsic);
-
   // Add the intrinsic ID as an integer operand if it's not a target intrinsic.
-  if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
-      Info.opc == ISD::INTRINSIC_W_CHAIN)
-    Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
+  if (!TgtMemIntrinsicInfo || TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_VOID ||
+      TgtMemIntrinsicInfo->opc == ISD::INTRINSIC_W_CHAIN)
+    Ops.push_back(DAG.getTargetConstant(I.getIntrinsicID(), getCurSDLoc(),
                                         TLI.getPointerTy(DAG.getDataLayout())));
 
   // Add all operands of the call to the operand list.
@@ -5369,13 +5369,85 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
     }
   }
 
+  if (std::optional<OperandBundleUse> Bundle =
+          I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    Value *Token = Bundle->Inputs[0].get();
+    SDValue ConvControlToken = getValue(Token);
+    assert(Ops.back().getValueType() != MVT::Glue &&
+           "Did not expect another glue node here.");
+    ConvControlToken =
+        DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken);
+    Ops.push_back(ConvControlToken);
+  }
+
+  return Ops;
+}
+
+SDVTList SelectionDAGBuilder::getTargetIntrinsicVTList(const CallBase &I,
+                                                       bool HasChain) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Value types of the call's result, followed by MVT::Other when chained.
   SmallVector<EVT, 4> ValueVTs;
   ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
 
   if (HasChain)
     ValueVTs.push_back(MVT::Other);
 
-  SDVTList VTs = DAG.getVTList(ValueVTs);
+  return DAG.getVTList(ValueVTs);
+}
+
+/// Get an INTRINSIC node for a target intrinsic which does not touch memory:
+/// WO_CHAIN when unchained, otherwise VOID or W_CHAIN by the result type.
+SDValue SelectionDAGBuilder::getTargetNonMemIntrinsicNode(
+    const Type &IntrinsicVT, bool HasChain, ArrayRef<SDValue> Ops,
+    const SDVTList &VTs) {
+  unsigned Opc = ISD::INTRINSIC_WO_CHAIN;
+  if (HasChain)
+    Opc = IntrinsicVT.isVoidTy() ? ISD::INTRINSIC_VOID : ISD::INTRINSIC_W_CHAIN;
+  return DAG.getNode(Opc, getCurSDLoc(), VTs, Ops);
+}
+
+/// Set root, convert return type if necessary and check alignment.
+SDValue SelectionDAGBuilder::handleTargetIntrinsicRet(const CallBase &I,
+                                                      bool HasChain,
+                                                      bool OnlyLoad,
+                                                      SDValue Result) {
+  if (HasChain) {
+    // The chain is the last value produced by the intrinsic node.
+    SDValue Chain = Result.getValue(Result.getNode()->getNumValues() - 1);
+    if (OnlyLoad)
+      PendingLoads.push_back(Chain);
+    else
+      DAG.setRoot(Chain);
+  }
+
+  if (I.getType()->isVoidTy())
+    return Result;
+
+  // As before the refactoring, the range and alignment assertions are applied
+  // independently of each other rather than being mutually exclusive.
+  if (!isa<VectorType>(I.getType()))
+    Result = lowerRangeToAssertZExt(DAG, I, Result);
+  if (MaybeAlign Alignment = I.getRetAlign(); InsertAssertAlign && Alignment)
+    Result = DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+  return Result;
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+                                               unsigned Intrinsic) {
+  auto [HasChain, OnlyLoad] = getTargetIntrinsicCallProperties(I);
+
+  // Info is set by getTgtMemIntrinsic
+  TargetLowering::IntrinsicInfo Info;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  bool IsTgtMemIntrinsic =
+      TLI.getTgtMemIntrinsic(Info, I, DAG.getMachineFunction(), Intrinsic);
+
+  SmallVector<SDValue, 8> Ops = getTargetIntrinsicOperands(
+      I, HasChain, OnlyLoad, IsTgtMemIntrinsic ? &Info : nullptr);
+  SDVTList VTs = getTargetIntrinsicVTList(I, HasChain);
 
   // Propagate fast-math-flags from IR to node(s).
   SDNodeFlags Flags;
@@ -5386,19 +5458,9 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
   // Create the node.
   SDValue Result;
 
-  if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
-    auto *Token = Bundle->Inputs[0].get();
-    SDValue ConvControlToken = getValue(Token);
-    assert(Ops.back().getValueType() != MVT::Glue &&
-           "Did not expected another glue node here.");
-    ConvControlToken =
-        DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, MVT::Glue, ConvControlToken);
-    Ops.push_back(ConvControlToken);
-  }
-
   // In some cases, custom collection of operands from CallInst I may be needed.
   TLI.CollectTargetIntrinsicOperands(I, Ops, DAG);
-  if (IsTgtIntrinsic) {
+  if (IsTgtMemIntrinsic) {
     // This is target intrinsic that touches memory
     //
     // TODO: We currently just fallback to address space 0 if getTgtMemIntrinsic
@@ -5418,34 +5480,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
         Info.ssid, Info.order, Info.failureOrder);
     Result =
         DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, MemVT, MMO);
-  } else if (!HasChain) {
-    Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
-  } else if (!I.getType()->isVoidTy()) {
-    Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
   } else {
-    Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+    Result = getTargetNonMemIntrinsicNode(*I.getType(), HasChain, Ops, VTs);
   }
 
-  if (HasChain) {
-    SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
-    if (OnlyLoad)
-      PendingLoads.push_back(Chain);
-    else
-      DAG.setRoot(Chain);
-  }
-
-  if (!I.getType()->isVoidTy()) {
-    if (!isa<VectorType>(I.getType()))
-      Result = lowerRangeToAssertZExt(DAG, I, Result);
-
-    MaybeAlign Alignment = I.getRetAlign();
-
-    // Insert `assertalign` node if there's an alignment.
-    if (InsertAssertAlign && Alignment) {
-      Result =
-          DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
-    }
-  }
+  Result = handleTargetIntrinsicRet(I, HasChain, OnlyLoad, Result);
 
   setValue(&I, Result);
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 47e19f7..ed63bee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -727,6 +727,17 @@ private:
                        MCSymbol *&BeginLabel);
   SDValue lowerEndEH(SDValue Chain, const InvokeInst *II,
                      const BasicBlock *EHPadBB, MCSymbol *BeginLabel);
+
+  std::pair<bool, bool> getTargetIntrinsicCallProperties(const CallBase &I);
+  SmallVector<SDValue, 8> getTargetIntrinsicOperands(
+      const CallBase &I, bool HasChain, bool OnlyLoad,
+      TargetLowering::IntrinsicInfo *TgtMemIntrinsicInfo = nullptr);
+  SDVTList getTargetIntrinsicVTList(const CallBase &I, bool HasChain);
+  SDValue getTargetNonMemIntrinsicNode(const Type &IntrinsicVT, bool HasChain,
+                                       ArrayRef<SDValue> Ops,
+                                       const SDVTList &VTs);
+  SDValue handleTargetIntrinsicRet(const CallBase &I, bool HasChain,
+                                   bool OnlyLoad, SDValue Result);
 };
 
 /// This struct represents the registers (physical or virtual)
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
index 35da82a..7e1d528 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -184,9 +184,9 @@ class SymbolSearchContext {
 public:
   SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
 
-  bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+  bool hasSearched(const LibraryInfo *Lib) const { return Searched.count(Lib); }
 
-  void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+  void markSearched(const LibraryInfo *Lib) { Searched.insert(Lib); }
 
   inline bool allResolved() const { return Q.allResolved(); }
 
@@ -194,7 +194,7 @@ public:
 
 private:
   SymbolQuery &Q;
-  DenseSet<LibraryInfo *> Searched;
+  DenseSet<const LibraryInfo *> Searched;
 };
 
 void LibraryResolver::resolveSymbolsInLibrary(
@@ -226,19 +226,18 @@ void LibraryResolver::resolveSymbolsInLibrary(
           return EnumerateResult::Continue;
         },
         Opts);
+  };
 
+  if (!Lib.hasFilter()) {
+    LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+                      << "\n";);
+    enumerateSymbolsIfNeeded();
     if (DiscoveredSymbols.empty()) {
       LLVM_DEBUG(dbgs() << "  No symbols and remove library : "
                         << Lib.getFullPath() << "\n";);
       LibMgr.removeLibrary(Lib.getFullPath());
       return;
     }
-  };
-
-  if (!Lib.hasFilter()) {
-    LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
-                      << "\n";);
-    enumerateSymbolsIfNeeded();
     SmallVector<StringRef> SymbolVec;
     SymbolVec.reserve(DiscoveredSymbols.size());
     for (const auto &KV : DiscoveredSymbols)
@@ -288,11 +287,15 @@ void LibraryResolver::searchSymbolsInLibraries(
 
     SymbolSearchContext Ctx(Q);
     while (!Ctx.allResolved()) {
+      std::vector<std::shared_ptr<LibraryInfo>> Libs;
+      LibMgr.getLibraries(S, K, Libs, [&](const LibraryInfo &Lib) {
+        return !Ctx.hasSearched(&Lib);
+      });
 
-      for (auto &Lib : LibMgr.getView(S, K)) {
-        if (Ctx.hasSearched(Lib.get()))
-          continue;
+      if (Libs.empty() && !scanLibrariesIfNeeded(K, scanBatchSize))
+        break; // no more new libs to scan
 
+      for (auto &Lib : Libs) {
         // can use Async here?
         resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
         Ctx.markSearched(Lib.get());
@@ -300,12 +303,6 @@ void LibraryResolver::searchSymbolsInLibraries(
         if (Ctx.allResolved())
           return;
       }
-
-      if (Ctx.allResolved())
-        return;
-
-      if (!scanLibrariesIfNeeded(K, scanBatchSize))
-        break; // no more new libs to scan
     }
   };
 
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
index d93f686..32f6dbe 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -50,7 +50,7 @@ void handleError(Error Err, StringRef context = "") {
 }
 
 bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
-  Triple HostTriple(sys::getDefaultTargetTriple());
+  Triple HostTriple(sys::getProcessTriple());
   Triple ObjTriple = Obj.makeTriple();
 
   LLVM_DEBUG({
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2987468..40e6400 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -50,6 +50,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetCallingConv.h"
@@ -96,7 +97,6 @@
 #include <cctype>
 #include <cstdint>
 #include <cstdlib>
-#include <deque>
 #include <iterator>
 #include <limits>
 #include <optional>
@@ -105,7 +105,6 @@
 #include <vector>
 
 using namespace llvm;
-using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "aarch64-lower"
 
@@ -1175,6 +1174,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
+  setTargetDAGCombine(ISD::CTPOP);
 
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
@@ -11331,9 +11331,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
     break;
   }
 
+  // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
+  // prefer using SVE if available.
   if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(
-          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
+      useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
     switch (Opcode) {
     default:
       llvm_unreachable("Wrong instruction");
@@ -17555,6 +17556,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
     // udot instruction.
     if (SrcWidth * 4 <= DstWidth) {
       if (all_of(I->users(), [&](auto *U) {
+            using namespace llvm::PatternMatch;
             auto *SingleUser = cast<Instruction>(&*U);
             if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
               return true;
@@ -17826,6 +17828,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   // into shift / and masks. For the moment we do this just for uitofp (not
   // zext) to avoid issues with widening instructions.
   if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
+        using namespace llvm::PatternMatch;
         return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
                SI->getType()->getScalarSizeInBits() * 4 ==
                    SI->user_back()->getType()->getScalarSizeInBits();
@@ -17990,17 +17993,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   unsigned Factor,
                                                   const APInt &GapMask) const {
 
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
-
-  if (isProfitableToInterleaveWithGatherScatter() &&
-      Factor > getMaxSupportedInterleaveFactor())
-    return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
-
-  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
-         "Invalid interleave factor");
-
   assert(!LaneMask && GapMask.popcount() == Factor &&
          "Unexpected mask on store");
 
@@ -18146,126 +18143,6 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
   return true;
 }
 
-/// If the interleaved vector elements are greater than supported MaxFactor,
-/// interleaving the data with additional shuffles can be used to
-/// achieve the same.
-///
-/// Consider the following data with 8 interleaves which are shuffled to store
-/// stN instructions. Data needs to be stored in this order:
-///     [v0, v1, v2, v3, v4, v5, v6, v7]
-///
-///    v0      v4      v2      v6      v1      v5      v3      v7
-///    |       |       |       |       |       |       |       |
-///     \     /         \     /         \     /         \     /
-///   [zip v0,v4]      [zip v2,v6]    [zip v1,v5]      [zip v3,v7] ==> stN = 4
-///        |               |              |                 |
-///         \             /                \               /
-///          \           /                  \             /
-///           \         /                    \           /
-///       [zip [v0,v2,v4,v6]]            [zip [v1,v3,v5,v7]]     ==> stN = 2
-///
-/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
-/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with
-/// another st4.
-///
-/// For stN = 2, upper half of interleaved data V0, V1 is stored
-/// with one st2 instruction. Second set V2, V3 is stored with another st2.
-/// Total of 4 st2's are required here.
-bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
-    StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
-  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
-
-  auto *VecTy = cast<FixedVectorType>(SVI->getType());
-  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
-
-  unsigned LaneLen = VecTy->getNumElements() / Factor;
-  Type *EltTy = VecTy->getElementType();
-  auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
-
-  const DataLayout &DL = SI->getModule()->getDataLayout();
-  bool UseScalable;
-
-  // Skip if we do not have NEON and skip illegal vector types. We can
-  // "legalize" wide vector types into multiple interleaved accesses as long as
-  // the vector types are divisible by 128.
-  if (!Subtarget->hasNEON() ||
-      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
-    return false;
-
-  if (UseScalable)
-    return false;
-
-  std::deque<Value *> Shuffles;
-  Shuffles.push_back(SVI);
-  unsigned ConcatLevel = Factor;
-  // Getting all the interleaved operands.
-  while (ConcatLevel > 1) {
-    unsigned InterleavedOperands = Shuffles.size();
-    for (unsigned i = 0; i < InterleavedOperands; i++) {
-      ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
-      if (!SFL)
-        return false;
-      Shuffles.pop_front();
-
-      Value *Op0 = SFL->getOperand(0);
-      Value *Op1 = SFL->getOperand(1);
-
-      Shuffles.push_back(dyn_cast<Value>(Op0));
-      Shuffles.push_back(dyn_cast<Value>(Op1));
-    }
-    ConcatLevel >>= 1;
-  }
-
-  IRBuilder<> Builder(SI);
-  auto Mask = createInterleaveMask(LaneLen, 2);
-  SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
-  for (unsigned i = 0; i < LaneLen; i++) {
-    LowerHalfMask[i] = Mask[i];
-    UpperHalfMask[i] = Mask[i + LaneLen];
-  }
-
-  unsigned InterleaveFactor = Factor >> 1;
-  while (InterleaveFactor >= MaxSupportedFactor) {
-    std::deque<Value *> ShufflesIntermediate;
-    ShufflesIntermediate.resize(Factor);
-    for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
-      for (unsigned i = 0; i < InterleaveFactor; i++) {
-        auto *Shuffle = Builder.CreateShuffleVector(
-            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
-        ShufflesIntermediate[i + j] = Shuffle;
-        Shuffle = Builder.CreateShuffleVector(
-            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
-        ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle;
-      }
-    }
-    Shuffles = ShufflesIntermediate;
-    InterleaveFactor >>= 1;
-  }
-
-  Type *PtrTy = SI->getPointerOperandType();
-  auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
-
-  Value *BaseAddr = SI->getPointerOperand();
-  Function *StNFunc = getStructuredStoreFunction(
-      SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
-  for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
-    SmallVector<Value *, 5> Ops;
-    for (unsigned j = 0; j < MaxSupportedFactor; j++)
-      Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
-
-    if (i > 0) {
-      // We will compute the pointer operand of each store from the original
-      // base address using GEPs. Cast the base address to a pointer to the
-      // scalar  element type.
-      BaseAddr = Builder.CreateConstGEP1_32(
-          SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
-    }
-    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
-    Builder.CreateCall(StNFunc, Ops);
-  }
-  return true;
-}
-
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
@@ -27968,6 +27845,35 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
       {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
 }
 
+static SDValue performCTPOPCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   SelectionDAG &DAG) {
+  using namespace llvm::SDPatternMatch;
+  if (!DCI.isBeforeLegalize()) // This fold is only attempted pre-legalization.
+    return SDValue();
+
+  // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
+  SDValue Mask;
+  if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask)))))
+    return SDValue();
+
+  EVT VT = N->getValueType(0); // Result type of the CTPOP node.
+  EVT MaskVT = Mask.getValueType(); // Type of the bitcast's source.
+
+  if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
+      MaskVT.getVectorElementType() != MVT::i1)
+    return SDValue(); // Needs a scalar result and a fixed-length i1 vector.
+
+  EVT ReduceInVT = // One VT-typed lane per mask element.
+      EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount());
+
+  SDLoc DL(N);
+  // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
+  SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
+  SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
+  return DAG.getNegative(NegPopCount, DL, VT); // Negate the sum of 0/-1 lanes.
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -28313,6 +28219,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performScalarToVectorCombine(N, DCI, DAG);
   case ISD::SHL:
     return performSHLCombine(N, DCI, DAG);
+  case ISD::CTPOP:
+    return performCTPOPCombine(N, DCI, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index bfd8474..70bfae7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,10 +229,6 @@ public:
 
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
 
-  bool isProfitableToInterleaveWithGatherScatter() const override {
-    return true;
-  }
-
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
   bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -243,9 +239,6 @@ public:
                              ShuffleVectorInst *SVI, unsigned Factor,
                              const APInt &GapMask) const override;
 
-  bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
-                                        unsigned Factor) const;
-
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8729ed3..197aae6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4922,36 +4922,11 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
     return InstructionCost::getInvalid();
 
-  unsigned NumLoadStores = 1;
-  InstructionCost ShuffleCost = 0;
-  bool isInterleaveWithShuffle = false;
-  unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
-
-  auto *SubVecTy =
-      VectorType::get(VecVTy->getElementType(),
-                      VecVTy->getElementCount().divideCoefficientBy(Factor));
-
-  if (TLI->isProfitableToInterleaveWithGatherScatter() &&
-      Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
-      Factor > MaxSupportedFactor) {
-    isInterleaveWithShuffle = true;
-    SmallVector<int, 16> Mask;
-    // preparing interleave Mask.
-    for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
-         i++) {
-      for (unsigned j = 0; j < 2; j++)
-        Mask.push_back(j * Factor + i);
-    }
-
-    NumLoadStores = Factor / MaxSupportedFactor;
-    ShuffleCost =
-        (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
-                                 Mask, CostKind, 0, SubVecTy));
-  }
-
-  if (!UseMaskForGaps &&
-      (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
+  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
+    auto *SubVecTy =
+        VectorType::get(VecVTy->getElementType(),
+                        VecVTy->getElementCount().divideCoefficientBy(Factor));
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
@@ -4959,10 +4934,7 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
     bool UseScalable;
     if (MinElts % Factor == 0 &&
         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
-      return (Factor *
-              TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
-              NumLoadStores) +
-             ShuffleCost;
+      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index e3b0a1b..e62fdb6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -312,7 +312,7 @@ public:
   }
 
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const {
-    if (!ST->hasSVE())
+    if (!ST->isSVEorStreamingSVEAvailable())
       return false;
 
     // For fixed vectors, avoid scalarization if using SVE for them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4fe194c..54d94b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2366,18 +2366,6 @@ def isGFX8GFX9NotGFX90A :
             " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
   AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
 
-// Pre-90A GFX9s allow the NV bit in FLAT instructions.
-def isNVAllowedInFlat :
-  Predicate<"!Subtarget->hasGFX90AInsts() &&"
-            " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
-  AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>;
-
-// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions.
-def isNVNotAllowedInFlat :
-  Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||"
-            " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">,
-  AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>;
-
 def isGFX90AOnly :
   Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
   AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2808c44..09338c5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1602,11 +1602,6 @@ public:
 
   bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
 
-  bool isFlatInstAndNVAllowed(const MCInst &Inst) const {
-    uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
-    return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A();
-  }
-
   AMDGPUTargetStreamer &getTargetStreamer() {
     MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
     return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -5375,7 +5370,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
       S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
       Error(S, "scale_offset is not supported on this GPU");
     }
-    if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) {
+    if (CPol & CPol::NV) {
       SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
       StringRef CStr(S.getPointer());
       S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7150,13 +7145,6 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
   unsigned Enabled = 0, Seen = 0;
   for (;;) {
     SMLoc S = getLoc();
-
-    if (isGFX9() && trySkipId("nv")) {
-      Enabled |= CPol::NV;
-      Seen |= CPol::NV;
-      continue;
-    }
-
     bool Disabling;
     unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
     if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6ef2241..8ea64d1 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<7> saddr;
   bits<10> vdst;
 
-  bits<6> cpol;
+  bits<5> cpol;
 
   // Only valid on gfx9
   bits<1> lds = ps.lds; // LDS DMA for global and scratch
@@ -2693,52 +2693,29 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
                   !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
 }
 
-class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
-  FLAT_Real_vi <op, ps, has_sccb> {
-  let AssemblerPredicate = isNVNotAllowedInFlat;
-}
-
-class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
-  FLAT_Real_vi <op, ps, has_sccb> {
-  let AssemblerPredicate = isNVAllowedInFlat;
-  let Subtarget = SIEncodingFamily.GFX9;
-  let DecoderNamespace = "GFX9";
-  let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit.
-}
-
-multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> {
-  def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>;
-  def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>;
-}
-
 multiclass FLAT_Real_AllAddr_vi<bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
-  defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
-  defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
-}
-
-multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op,
-  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
-  def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
-  def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+  def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
 }
 
 class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
   FLAT_Real <op, ps>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
   let AssemblerPredicate = isGFX940Plus;
-  let DecoderNamespace = "GFX940";
+  let DecoderNamespace = "GFX9";
   let Inst{13} = ps.sve;
   let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
 }
 
 multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
-  let OtherPredicates = [isGFX8GFX9NotGFX940] in {
-    defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>;
+  def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
+    let AssemblerPredicate = isGFX8GFX9NotGFX940;
+    let OtherPredicates = [isGFX8GFX9NotGFX940];
+  }
+  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
+    let DecoderNamespace = "GFX9";
   }
-
-  defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
-
   let AssemblerPredicate = isGFX940Plus in {
     def _VE_gfx940  : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
     def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
@@ -2751,11 +2728,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
 
   let OtherPredicates = [isGFX8GFX9NotGFX940] in {
-    let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in {
-      defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+    def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
+      let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
     }
-    let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in {
-      defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+    def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
+      let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
     }
   }
 
@@ -2771,66 +2748,47 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
   def _ST_gfx940  : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
 }
 
-defm FLAT_LOAD_UBYTE_vi         : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>;
-defm FLAT_LOAD_SBYTE_vi         : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>;
-defm FLAT_LOAD_USHORT_vi        : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>;
-defm FLAT_LOAD_SSHORT_vi        : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>;
-defm FLAT_LOAD_DWORD_vi         : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>;
-defm FLAT_LOAD_DWORDX2_vi       : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>;
-defm FLAT_LOAD_DWORDX4_vi       : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>;
-defm FLAT_LOAD_DWORDX3_vi       : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>;
-
-defm FLAT_STORE_BYTE_vi         : FLAT_Real_mc_vi <0x18, FLAT_STORE_BYTE>;
-defm FLAT_STORE_BYTE_D16_HI_vi  : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
-defm FLAT_STORE_SHORT_vi        : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>;
-defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
-defm FLAT_STORE_DWORD_vi        : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>;
-defm FLAT_STORE_DWORDX2_vi      : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>;
-defm FLAT_STORE_DWORDX4_vi      : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>;
-defm FLAT_STORE_DWORDX3_vi      : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>;
-
-defm FLAT_LOAD_UBYTE_D16_vi    : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>;
-defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
-defm FLAT_LOAD_SBYTE_D16_vi    : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>;
-defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
-defm FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>;
-defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+def FLAT_LOAD_UBYTE_vi         : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_vi         : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_vi        : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_vi        : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_vi         : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_vi       : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_vi       : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_vi       : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_vi         : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_BYTE_D16_HI_vi  : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
+def FLAT_STORE_SHORT_vi        : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
+def FLAT_STORE_DWORD_vi        : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_vi      : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_vi      : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_vi      : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+def FLAT_LOAD_UBYTE_D16_vi    : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+def FLAT_LOAD_SBYTE_D16_vi    : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+def FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
+def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
 
 multiclass FLAT_Real_Atomics_vi <bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
   defvar ps = !cast<FLAT_Pseudo>(NAME);
-  defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
-  defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
-  def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
-}
-
-multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op,
-  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
-  defvar ps = !cast<FLAT_Pseudo>(NAME);
-  def _vi     : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
-  def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
-
-  def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+  def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+  def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+  def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
 }
 
 multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
   FLAT_Real_AllAddr_vi<op, has_sccb> {
-  defm _RTN  : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
-  defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
-
-  def _RTN_agpr_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
-  def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
-}
-
-multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op,
-  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
-  FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> {
-  def _RTN_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
-  def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+  def _RTN_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+  def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
 
-  def _RTN_agpr_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
-  def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+  def _RTN_agpr_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+  def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
 }
 
 defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40>;
@@ -2992,10 +2950,10 @@ let AssemblerPredicate = isGFX940Plus in {
   defm GLOBAL_ATOMIC_ADD_F64     : FLAT_Global_Real_Atomics_gfx940<0x4f>;
   defm GLOBAL_ATOMIC_MIN_F64     : FLAT_Global_Real_Atomics_gfx940<0x50>;
   defm GLOBAL_ATOMIC_MAX_F64     : FLAT_Global_Real_Atomics_gfx940<0x51>;
-  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>;
-  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>;
-  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi_ex_gfx9<0x52>;
-  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>;
+  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d>;
+  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e>;
+  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52>;
+  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
 } // End AssemblerPredicate = isGFX940Plus
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 3e6f35d..703ec0a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -186,12 +186,8 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
     O << " dlc";
   if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
     O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
-  if (Imm & ~CPol::ALL_pregfx12) {
-    if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
-      O << " nv";
-    else
-      O << " /* unexpected cache policy bit */";
-  }
+  if (Imm & ~CPol::ALL_pregfx12)
+    O << " /* unexpected cache policy bit */";
 }
 
 void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index c89212d..90a4723 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -756,6 +756,155 @@ LoongArchInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
   return ArrayRef(TargetFlags);
 }
 
+bool LoongArchInstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
+                                             Register Reg,
+                                             const MachineInstr &AddrI,
+                                             ExtAddrMode &AM) const {
+  enum MemIOffsetType { // Offset-immediate format accepted by MemI's opcode.
+    Imm14Shift2, // isShiftedInt<14, 2>; additionally gated on STI.hasUAL().
+    Imm12,       // isInt<12>
+    Imm11Shift1, // isShiftedInt<11, 1>
+    Imm10Shift2, // isShiftedInt<10, 2>
+    Imm9Shift3,  // isShiftedInt<9, 3>
+    Imm8,        // isInt<8>
+    Imm8Shift1,  // isShiftedInt<8, 1>
+    Imm8Shift2,  // isShiftedInt<8, 2>
+    Imm8Shift3   // isShiftedInt<8, 3>
+  };
+
+  MemIOffsetType OT; // Assigned by every non-default case below.
+  switch (MemI.getOpcode()) {
+  default:
+    return false; // Opcode is not handled by this fold.
+  case LoongArch::LDPTR_W:
+  case LoongArch::LDPTR_D:
+  case LoongArch::STPTR_W:
+  case LoongArch::STPTR_D:
+    OT = Imm14Shift2;
+    break;
+  case LoongArch::LD_B:
+  case LoongArch::LD_H:
+  case LoongArch::LD_W:
+  case LoongArch::LD_D:
+  case LoongArch::LD_BU:
+  case LoongArch::LD_HU:
+  case LoongArch::LD_WU:
+  case LoongArch::ST_B:
+  case LoongArch::ST_H:
+  case LoongArch::ST_W:
+  case LoongArch::ST_D:
+  case LoongArch::FLD_S:
+  case LoongArch::FLD_D:
+  case LoongArch::FST_S:
+  case LoongArch::FST_D:
+  case LoongArch::VLD:
+  case LoongArch::VST:
+  case LoongArch::XVLD:
+  case LoongArch::XVST:
+  case LoongArch::VLDREPL_B:
+  case LoongArch::XVLDREPL_B:
+    OT = Imm12;
+    break;
+  case LoongArch::VLDREPL_H:
+  case LoongArch::XVLDREPL_H:
+    OT = Imm11Shift1;
+    break;
+  case LoongArch::VLDREPL_W:
+  case LoongArch::XVLDREPL_W:
+    OT = Imm10Shift2;
+    break;
+  case LoongArch::VLDREPL_D:
+  case LoongArch::XVLDREPL_D:
+    OT = Imm9Shift3;
+    break;
+  case LoongArch::VSTELM_B:
+  case LoongArch::XVSTELM_B:
+    OT = Imm8;
+    break;
+  case LoongArch::VSTELM_H:
+  case LoongArch::XVSTELM_H:
+    OT = Imm8Shift1;
+    break;
+  case LoongArch::VSTELM_W:
+  case LoongArch::XVSTELM_W:
+    OT = Imm8Shift2;
+    break;
+  case LoongArch::VSTELM_D:
+  case LoongArch::XVSTELM_D:
+    OT = Imm8Shift3;
+    break;
+  }
+
+  if (MemI.getOperand(0).getReg() == Reg) // Reg is also MemI's operand 0.
+    return false;
+
+  if ((AddrI.getOpcode() != LoongArch::ADDI_W &&
+       AddrI.getOpcode() != LoongArch::ADDI_D) ||
+      !AddrI.getOperand(1).isReg() || !AddrI.getOperand(2).isImm())
+    return false; // AddrI must be ADDI_W/ADDI_D of a reg and an imm.
+
+  int64_t OldOffset = MemI.getOperand(2).getImm(); // MemI's current offset.
+  int64_t Disp = AddrI.getOperand(2).getImm(); // Immediate added by AddrI.
+  int64_t NewOffset = OldOffset + Disp;
+  if (!STI.is64Bit()) // Not 64-bit: wrap the sum to 32 bits.
+    NewOffset = SignExtend64<32>(NewOffset);
+
+  if (!(OT == Imm14Shift2 && isShiftedInt<14, 2>(NewOffset) && STI.hasUAL()) &&
+      !(OT == Imm12 && isInt<12>(NewOffset)) &&
+      !(OT == Imm11Shift1 && isShiftedInt<11, 1>(NewOffset)) &&
+      !(OT == Imm10Shift2 && isShiftedInt<10, 2>(NewOffset)) &&
+      !(OT == Imm9Shift3 && isShiftedInt<9, 3>(NewOffset)) &&
+      !(OT == Imm8 && isInt<8>(NewOffset)) &&
+      !(OT == Imm8Shift1 && isShiftedInt<8, 1>(NewOffset)) &&
+      !(OT == Imm8Shift2 && isShiftedInt<8, 2>(NewOffset)) &&
+      !(OT == Imm8Shift3 && isShiftedInt<8, 3>(NewOffset)))
+    return false; // NewOffset does not fit MemI's offset format.
+
+  AM.BaseReg = AddrI.getOperand(1).getReg(); // Use AddrI's source register.
+  AM.ScaledReg = 0; // No scaled register: plain base + displacement.
+  AM.Scale = 0;
+  AM.Displacement = NewOffset;
+  AM.Form = ExtAddrMode::Formula::Basic;
+  return true;
+}
+
+MachineInstr *
+LoongArchInstrInfo::emitLdStWithAddr(MachineInstr &MemI,
+                                     const ExtAddrMode &AM) const {
+  const DebugLoc &DL = MemI.getDebugLoc();
+  MachineBasicBlock &MBB = *MemI.getParent();
+
+  assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
+         "Addressing mode not supported for folding");
+
+  unsigned MemIOp = MemI.getOpcode(); // The new instruction reuses the opcode.
+  switch (MemIOp) {
+  default: // Three-operand form: operand 0, base register, displacement.
+    return BuildMI(MBB, MemI, DL, get(MemIOp))
+        .addReg(MemI.getOperand(0).getReg(),
+                MemI.mayLoad() ? RegState::Define : 0)
+        .addReg(AM.BaseReg)
+        .addImm(AM.Displacement)
+        .setMemRefs(MemI.memoperands()) // Keep MemI's memory operands.
+        .setMIFlags(MemI.getFlags());
+  case LoongArch::VSTELM_B:
+  case LoongArch::VSTELM_H:
+  case LoongArch::VSTELM_W:
+  case LoongArch::VSTELM_D:
+  case LoongArch::XVSTELM_B:
+  case LoongArch::XVSTELM_H:
+  case LoongArch::XVSTELM_W:
+  case LoongArch::XVSTELM_D: // These forward one extra immediate operand.
+    return BuildMI(MBB, MemI, DL, get(MemIOp))
+        .addReg(MemI.getOperand(0).getReg(), 0)
+        .addReg(AM.BaseReg)
+        .addImm(AM.Displacement)
+        .addImm(MemI.getOperand(3).getImm()) // Copied from MemI unchanged.
+        .setMemRefs(MemI.memoperands())
+        .setMIFlags(MemI.getFlags());
+  }
+}
+
 // Returns true if this is the sext.w pattern, addi.w rd, rs, 0.
 bool LoongArch::isSEXT_W(const MachineInstr &MI) {
   return MI.getOpcode() == LoongArch::ADDI_W && MI.getOperand(1).isReg() &&
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index f25958a..f69a558 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -93,6 +93,12 @@ public:
   ArrayRef<std::pair<unsigned, const char *>>
   getSerializableBitmaskMachineOperandTargetFlags() const override;
 
+  bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg,
+                           const MachineInstr &AddrI,
+                           ExtAddrMode &AM) const override;
+  MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
+                                 const ExtAddrMode &AM) const override;
+
 protected:
   const LoongArchSubtarget &STI;
 };
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 9de4c9d..92a9388 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -62,6 +62,11 @@ static cl::opt<bool>
                           cl::desc("Enable the merge base offset pass"),
                           cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+    EnableSinkFold("loongarch-enable-sink-fold",
+                   cl::desc("Enable sinking and folding of instruction copies"),
+                   cl::init(true), cl::Hidden);
+
 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
 }
@@ -146,7 +151,9 @@ namespace {
 class LoongArchPassConfig : public TargetPassConfig {
 public:
   LoongArchPassConfig(LoongArchTargetMachine &TM, PassManagerBase &PM)
-      : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {
+    setEnableSinkAndFold(EnableSinkFold);
+  }
 
   LoongArchTargetMachine &getLoongArchTargetMachine() const {
     return getTM<LoongArchTargetMachine>();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c3f100e..995ae75 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16496,32 +16496,42 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX,
-                               unsigned ShY) {
+                               unsigned ShY, bool AddX) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   SDValue X = N->getOperand(0);
   SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
                                DAG.getTargetConstant(ShY, DL, VT), X);
   return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
-                     DAG.getTargetConstant(ShX, DL, VT), Mul359);
+                     DAG.getTargetConstant(ShX, DL, VT), AddX ? X : Mul359);
 }
 
 static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
                                        uint64_t MulAmt) {
+  // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
   switch (MulAmt) {
   case 5 * 3:
-    return getShlAddShlAdd(N, DAG, 2, 1);
+    return getShlAddShlAdd(N, DAG, 2, 1, /*AddX=*/false);
   case 9 * 3:
-    return getShlAddShlAdd(N, DAG, 3, 1);
+    return getShlAddShlAdd(N, DAG, 3, 1, /*AddX=*/false);
   case 5 * 5:
-    return getShlAddShlAdd(N, DAG, 2, 2);
+    return getShlAddShlAdd(N, DAG, 2, 2, /*AddX=*/false);
   case 9 * 5:
-    return getShlAddShlAdd(N, DAG, 3, 2);
+    return getShlAddShlAdd(N, DAG, 3, 2, /*AddX=*/false);
   case 9 * 9:
-    return getShlAddShlAdd(N, DAG, 3, 3);
+    return getShlAddShlAdd(N, DAG, 3, 3, /*AddX=*/false);
   default:
-    return SDValue();
+    break;
   }
+
+  // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
+  int ShX;
+  if (int ShY = isShifted359(MulAmt - 1, ShX)) {
+    assert(ShX != 0 && "MulAmt=4,6,10 handled before");
+    if (ShX <= 3)
+      return getShlAddShlAdd(N, DAG, ShX, ShY, /*AddX=*/true);
+  }
+  return SDValue();
 }
 
 // Try to expand a scalar multiply to a faster sequence.
@@ -16581,41 +16591,30 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
                          DAG.getConstant(Shift, DL, VT));
     }
 
-    // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
-    if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt))
-      return V;
+    // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
+    // of 25 which happen to be quite common.
+    // (2/4/8 * 3/5/9 + 1) * 2^N
+    Shift = llvm::countr_zero(MulAmt);
+    if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
+      if (Shift == 0)
+        return V;
+      SDLoc DL(N);
+      return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
+    }
 
     // If this is a power 2 + 2/4/8, we can use a shift followed by a single
     // shXadd. First check if this a sum of two power of 2s because that's
     // easy. Then count how many zeros are up to the first bit.
-    if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
-      unsigned ScaleShift = llvm::countr_zero(MulAmt);
-      if (ScaleShift >= 1 && ScaleShift < 4) {
-        unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
-        SDLoc DL(N);
-        SDValue Shift1 =
-            DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
-        return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                           DAG.getTargetConstant(ScaleShift, DL, VT), Shift1);
-      }
+    if (Shift >= 1 && Shift <= 3 && isPowerOf2_64(MulAmt & (MulAmt - 1))) {
+      unsigned ShiftAmt = llvm::countr_zero((MulAmt & (MulAmt - 1)));
+      SDLoc DL(N);
+      SDValue Shift1 =
+          DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
+      return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
+                         DAG.getTargetConstant(Shift, DL, VT), Shift1);
     }
 
-    // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
-    // This is the two instruction form, there are also three instruction
-    // variants we could implement.  e.g.
-    //   (2^(1,2,3) * 3,5,9 + 1) << C2
-    //   2^(C1>3) * 3,5,9 +/- 1
-    if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
-      assert(Shift != 0 && "MulAmt=4,6,10 handled before");
-      if (Shift <= 3) {
-        SDLoc DL(N);
-        SDValue Mul359 =
-            DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
-                        DAG.getTargetConstant(ShXAmount, DL, VT), X);
-        return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
-                           DAG.getTargetConstant(Shift, DL, VT), X);
-      }
-    }
+    // TODO: 2^(C1>3) * 3,5,9 +/- 1
 
     // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
     if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
@@ -16647,14 +16646,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
         return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
       }
     }
-
-    // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
-    // of 25 which happen to be quite common.
-    Shift = llvm::countr_zero(MulAmt);
-    if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
-      SDLoc DL(N);
-      return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT));
-    }
   }
 
   if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt))
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 636e31c..bf9de0a 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1583,7 +1583,10 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
               if (!TII->isAddImmediate(*DeadMI, Reg))
                 continue;
               LIS->RemoveMachineInstrFromMaps(*DeadMI);
+              Register AddReg = DeadMI->getOperand(1).getReg();
               DeadMI->eraseFromParent();
+              if (AddReg.isVirtual())
+                LIS->shrinkToUses(&LIS->getInterval(AddReg));
             }
           }
         }
@@ -1869,11 +1872,15 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
   // Loop over the dead AVL values, and delete them now.  This has
   // to be outside the above loop to avoid invalidating iterators.
   for (auto *MI : ToDelete) {
+    assert(MI->getOpcode() == RISCV::ADDI);
+    Register AddReg = MI->getOperand(1).getReg();
     if (LIS) {
       LIS->removeInterval(MI->getOperand(0).getReg());
       LIS->RemoveMachineInstrFromMaps(*MI);
     }
     MI->eraseFromParent();
+    if (LIS && AddReg.isVirtual())
+      LIS->shrinkToUses(&LIS->getInterval(AddReg));
   }
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 56a38bb..b2cbdb2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -2390,6 +2390,15 @@ static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call,
   return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR);
 }
 
+/// Emits the opcode the native builtin table maps this blocking-pipe call to.
+static bool generateBlockingPipesInst(const SPIRV::IncomingCall *Call,
+                                      MachineIRBuilder &MIRBuilder,
+                                      SPIRVGlobalRegistry *GR) {
+  const SPIRV::DemangledBuiltin &Info = *Call->Builtin;
+  const auto *Native = SPIRV::lookupNativeBuiltin(Info.Name, Info.Set);
+  return buildOpFromWrapper(MIRBuilder, Native->Opcode, Call, Register(0));
+}
+
 static bool
 generateTernaryBitwiseFunctionINTELInst(const SPIRV::IncomingCall *Call,
                                         MachineIRBuilder &MIRBuilder,
@@ -3050,6 +3059,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
     return generatePipeInst(Call.get(), MIRBuilder, GR);
   case SPIRV::PredicatedLoadStore:
     return generatePredicatedLoadStoreInst(Call.get(), MIRBuilder, GR);
+  case SPIRV::BlockingPipes:
+    return generateBlockingPipesInst(Call.get(), MIRBuilder, GR);
   }
   return false;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index c259cce..492a98e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -71,6 +71,7 @@ def TernaryBitwiseINTEL : BuiltinGroup;
 def Block2DLoadStore : BuiltinGroup;
 def Pipe : BuiltinGroup;
 def PredicatedLoadStore : BuiltinGroup;
+def BlockingPipes : BuiltinGroup;
 
 //===----------------------------------------------------------------------===//
 // Class defining a demangled builtin record. The information in the record
@@ -1174,6 +1175,10 @@ defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0
 defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
 defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
 defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+
+//SPV_ALTERA_blocking_pipes
+defm : DemangledNativeBuiltin<"__spirv_WritePipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpWritePipeBlockingALTERA>;
+defm : DemangledNativeBuiltin<"__spirv_ReadPipeBlockingINTEL", OpenCL_std, BlockingPipes, 0, 0, OpReadPipeBlockingALTERA>;
 defm : DemangledNativeBuiltin<"__spirv_ReadClockKHR", OpenCL_std, KernelClock, 1, 1, OpReadClockKHR>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 43b2869..f681b0d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -159,7 +159,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
         {"SPV_KHR_maximal_reconvergence",
          SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence},
         {"SPV_INTEL_kernel_attributes",
-         SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}};
+         SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes},
+        {"SPV_ALTERA_blocking_pipes",
+         SPIRV::Extension::Extension::SPV_ALTERA_blocking_pipes}};
 
 bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
                                   StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index a61351e..03bd61b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -993,3 +993,9 @@ def OpPredicatedLoadINTEL: Op<6528, (outs ID:$res), (ins TYPE:$resType, ID:$ptr,
                   "$res = OpPredicatedLoadINTEL $resType $ptr $predicate $default_value">;
 def OpPredicatedStoreINTEL: Op<6529, (outs), (ins ID:$ptr, ID:$object, ID:$predicate, variable_ops),
                   "OpPredicatedStoreINTEL $ptr $object $predicate">;
+
+//SPV_ALTERA_blocking_pipes
+def OpReadPipeBlockingALTERA :Op<5946, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+                   "OpReadPipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
+def OpWritePipeBlockingALTERA :Op<5947, (outs), (ins ID:$pipe, ID:$pointer, ID:$packetSize, ID:$packetAlignment),
+                   "OpWritePipeBlockingALTERA $pipe $pointer $packetSize $packetAlignment">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index e5ac76c4..af76016 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1885,6 +1885,13 @@ void addInstrRequirements(const MachineInstr &MI,
     Reqs.addCapability(
         SPIRV::Capability::CooperativeMatrixCheckedInstructionsINTEL);
     break;
+  case SPIRV::OpReadPipeBlockingALTERA:
+  case SPIRV::OpWritePipeBlockingALTERA:
+    if (ST.canUseExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes)) {
+      Reqs.addExtension(SPIRV::Extension::SPV_ALTERA_blocking_pipes);
+      Reqs.addCapability(SPIRV::Capability::BlockingPipesALTERA);
+    }
+    break;
   case SPIRV::OpCooperativeMatrixGetElementCoordINTEL:
     if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_joint_matrix))
       report_fatal_error("OpCooperativeMatrixGetElementCoordINTEL requires the "
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 4e4e6fb..be88f33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -56,6 +56,13 @@ public:
   }
 };
 
+static cl::list<std::string> SPVAllowUnknownIntrinsics(
+    "spv-allow-unknown-intrinsics", cl::CommaSeparated,
+    cl::desc("Emit unknown intrinsics as calls to external functions. A "
+             "comma-separated input list of intrinsic prefixes must be "
+             "provided, and only intrinsics carrying a listed prefix get "
+             "emitted as described."),
+    cl::value_desc("intrinsic_prefix_0,intrinsic_prefix_1"), cl::ValueOptional);
 } // namespace
 
 char SPIRVPrepareFunctions::ID = 0;
@@ -445,6 +452,15 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
                                        EraseFromParent);
         Changed = true;
         break;
+      default:
+        if (TM.getTargetTriple().getVendor() == Triple::AMD ||
+            any_of(SPVAllowUnknownIntrinsics, [II](auto &&Prefix) {
+              if (Prefix.empty())
+                return false;
+              return II->getCalledFunction()->getName().starts_with(Prefix);
+            }))
+          Changed |= lowerIntrinsicToFunction(II);
+        break;
       }
     }
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 1b4b29b..65a8885 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -309,7 +309,7 @@ defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>;
 defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>;
 defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>;
 defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>;
-defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
+defm SPV_ALTERA_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
 defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>;
 defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>;
 defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>;
@@ -611,6 +611,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso
 defm BFloat16TypeKHR : CapabilityOperand<5116, 0, 0, [SPV_KHR_bfloat16], []>;
 defm BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>;
 defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>;
+defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define SourceLanguage enum values and at the same time
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index af32298..fc6c290 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -216,7 +216,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     // Combine fp_to_{s,u}int_sat or fp_round of concat_vectors or vice versa
     // into conversion ops
     setTargetDAGCombine({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
-                         ISD::FP_ROUND, ISD::CONCAT_VECTORS});
+                         ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_ROUND,
+                         ISD::CONCAT_VECTORS});
 
     setTargetDAGCombine(ISD::TRUNCATE);
 
@@ -3580,6 +3581,64 @@ static SDValue performMulCombine(SDNode *N,
   }
 }
 
+/// Widen \p In with poison, doubling at least once, to >= \p RequiredNumElems.
+static SDValue DoubleVectorWidth(SDValue In, unsigned RequiredNumElems,
+                                 SelectionDAG &DAG) {
+  SDLoc DL(In);
+  LLVMContext &Ctx = *DAG.getContext();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = InVT.getVectorNumElements() * 2;
+  EVT OutVT = EVT::getVectorVT(Ctx, InVT.getVectorElementType(), NumElems);
+  SDValue Concat =
+      DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, In, DAG.getPOISON(InVT));
+  if (NumElems < RequiredNumElems)
+    return DoubleVectorWidth(Concat, RequiredNumElems, DAG);
+  return Concat;
+}
+
+/// Convert f32 vectors to i8/i16 vectors via an i32 conversion plus narrowing.
+static SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  EVT OutElTy = OutVT.getVectorElementType();
+  if (OutElTy != MVT::i8 && OutElTy != MVT::i16)
+    return SDValue();
+
+  unsigned NumElems = OutVT.getVectorNumElements();
+  if (!isPowerOf2_32(NumElems))
+    return SDValue();
+
+  EVT FPVT = N->getOperand(0)->getValueType(0);
+  if (FPVT.getVectorElementType() != MVT::f32)
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // First, convert to i32.
+  LLVMContext &Ctx = *DAG.getContext();
+  EVT IntVT = EVT::getVectorVT(Ctx, MVT::i32, NumElems);
+  SDValue ToInt = DAG.getNode(N->getOpcode(), DL, IntVT, N->getOperand(0));
+  APInt Mask = APInt::getLowBitsSet(IntVT.getScalarSizeInBits(),
+                                    OutVT.getScalarSizeInBits());
+  // Mask out the top MSBs.
+  SDValue Masked =
+      DAG.getNode(ISD::AND, DL, IntVT, ToInt, DAG.getConstant(Mask, DL, IntVT));
+
+  // Results of 128 bits or more can be narrowed directly.
+  if (OutVT.getSizeInBits() >= 128)
+    return truncateVectorWithNARROW(OutVT, Masked, DL, DAG);
+
+  // Create a wide enough vector that we can use narrow.
+  EVT NarrowedVT = OutElTy == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
+  unsigned NumRequiredElems = NarrowedVT.getVectorNumElements();
+  SDValue WideVector = DoubleVectorWidth(Masked, NumRequiredElems, DAG);
+  SDValue Trunc = truncateVectorWithNARROW(NarrowedVT, WideVector, DL, DAG);
+  return DAG.getBitcast(
+      OutVT, extractSubVector(Trunc, 0, DAG, DL, OutVT.getSizeInBits()));
+}
+
 SDValue
 WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
@@ -3606,6 +3665,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FP_ROUND:
   case ISD::CONCAT_VECTORS:
     return performVectorTruncZeroCombine(N, DCI);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return performConvertFPCombine(N, DCI.DAG);
   case ISD::TRUNCATE:
     return performTruncateCombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4d44227b3..168e041 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53442,7 +53442,8 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
   }
 
   SDValue NewStore =
-      DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+      DAG.getStore(St->getChain(), DL, Res, NewPtr,
+                   MachinePointerInfo(St->getPointerInfo().getAddrSpace()),
                    Align(), St->getMemOperand()->getFlags());
 
   // If there are other uses of StoredVal, replace with a new load of the
@@ -54639,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
         SDValue NewPtr = DAG.getMemBasePlusOffset(
             Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
         SDValue NewLoad =
-            DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+            DAG.getLoad(VT, DL, Ld->getChain(), NewPtr,
+                        MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()),
                         Align(), Ld->getMemOperand()->getFlags());
         DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
         return NewLoad;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5c3f17..906fa2f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7550,13 +7550,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
-                                 Load->getAlign(), VPIRMetadata(*Load, LVer),
-                                 I->getDebugLoc());
+                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, Store->getAlign(),
-                                VPIRMetadata(*Store, LVer), I->getDebugLoc());
+                                Reverse, VPIRMetadata(*Store, LVer),
+                                I->getDebugLoc());
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 22ea083..3062e1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1163,10 +1163,10 @@ public:
   bool opcodeMayReadOrWriteFromMemory() const;
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override;
+  bool usesFirstLaneOnly(const VPValue *Op) const override;
 
   /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override;
+  bool usesFirstPartOnly(const VPValue *Op) const override;
 
   /// Returns true if this VPInstruction produces a scalar value from a vector,
   /// e.g. by performing a reduction or extracting a lane.
@@ -1393,13 +1393,13 @@ public:
     return true;
   }
 
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
+  bool usesFirstPartOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
   }
 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -1628,7 +1628,7 @@ public:
              VPSlotTracker &SlotTracker) const override;
 #endif
 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override;
+  bool usesFirstLaneOnly(const VPValue *Op) const override;
 };
 
 /// A recipe for widening Call instructions using library calls.
@@ -1767,7 +1767,7 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return Op == getCond() && isInvariantCond();
@@ -1833,7 +1833,7 @@ public:
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     if (Op == getOperand(0))
@@ -1870,7 +1870,7 @@ public:
 
   void execute(VPTransformState &State) override;
 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -1884,7 +1884,7 @@ public:
   }
 
   /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
+  bool usesFirstPartOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -1922,14 +1922,14 @@ public:
 
   Type *getSourceElementType() const { return SourceElementTy; }
 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
   }
 
   /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
+  bool usesFirstPartOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     assert(getNumOperands() <= 2 && "must have at most two operands");
@@ -2110,7 +2110,7 @@ public:
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // The recipe creates its own wide start value, so it only requests the
@@ -2325,7 +2325,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return Op == getStartValue();
@@ -2399,7 +2399,7 @@ public:
   bool isInLoop() const { return IsInLoop; }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return isOrdered() || isInLoop();
@@ -2468,13 +2468,13 @@ public:
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Recursing through Blend recipes only, must terminate at header phi's the
     // latest.
     return all_of(users(),
-                  [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
+                  [this](VPUser *U) { return U->usesFirstLaneOnly(this); });
   }
 };
 
@@ -2562,7 +2562,7 @@ public:
                               VPCostContext &Ctx) const override;
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override = 0;
+  bool usesFirstLaneOnly(const VPValue *Op) const override = 0;
 
   /// Returns the number of stored operands of this interleave group. Returns 0
   /// for load interleave groups.
@@ -2608,7 +2608,7 @@ public:
              VPSlotTracker &SlotTracker) const override;
 #endif
 
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
@@ -2656,7 +2656,7 @@ public:
 #endif
 
   /// The recipe only uses the first lane of the address, and EVL operand.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
@@ -2862,7 +2862,7 @@ public:
   VPValue *getEVL() const { return getOperand(2); }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return Op == getEVL();
@@ -2924,7 +2924,7 @@ public:
   bool isPredicated() const { return IsPredicated; }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return isSingleScalar();
@@ -3206,14 +3206,14 @@ protected:
 
   VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
                       std::initializer_list<VPValue *> Operands,
-                      bool Consecutive, bool Reverse, Align Alignment,
+                      bool Consecutive, bool Reverse,
                       const VPIRMetadata &Metadata, DebugLoc DL)
       : VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
-        Alignment(Alignment), Consecutive(Consecutive), Reverse(Reverse) {
+        Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+        Reverse(Reverse) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
-    assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
-           !Reverse &&
-               "Reversed acccess without VPVectorEndPointerRecipe address?");
+    assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+           "Reversed acccess without VPVectorEndPointerRecipe address?");
   }
 
 public:
@@ -3273,18 +3273,18 @@ public:
 struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                                    public VPValue {
   VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
-                    bool Consecutive, bool Reverse, Align Alignment,
+                    bool Consecutive, bool Reverse,
                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
-                            Reverse, Alignment, Metadata, DL),
+                            Reverse, Metadata, DL),
         VPValue(this, &Load) {
     setMask(Mask);
   }
 
   VPWidenLoadRecipe *clone() override {
     return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
-                                 getMask(), Consecutive, Reverse, getAlign(),
-                                 *this, getDebugLoc());
+                                 getMask(), Consecutive, Reverse, *this,
+                                 getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3299,7 +3299,7 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened, consecutive loads operations only demand the first lane of
@@ -3315,8 +3315,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
-                            L.getAlign(), L, L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+                            L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3340,7 +3340,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened loads only demand the first lane of EVL and consecutive loads
@@ -3354,16 +3354,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
   VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
                      VPValue *Mask, bool Consecutive, bool Reverse,
-                     Align Alignment, const VPIRMetadata &Metadata, DebugLoc DL)
+                     const VPIRMetadata &Metadata, DebugLoc DL)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
-                            Consecutive, Reverse, Alignment, Metadata, DL) {
+                            Consecutive, Reverse, Metadata, DL) {
     setMask(Mask);
   }
 
   VPWidenStoreRecipe *clone() override {
     return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
                                   getStoredValue(), getMask(), Consecutive,
-                                  Reverse, getAlign(), *this, getDebugLoc());
+                                  Reverse, *this, getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3381,7 +3381,7 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     // Widened, consecutive stores only demand the first lane of their address,
@@ -3398,7 +3398,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
                         VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
                             {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
-                            S.isReverse(), S.getAlign(), S, S.getDebugLoc()) {
+                            S.isReverse(), S, S.getDebugLoc()) {
     setMask(Mask);
   }
 
@@ -3424,7 +3424,7 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
 #endif
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     if (Op == getEVL()) {
@@ -3508,14 +3508,14 @@ public:
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
   }
 
   /// Returns true if the recipe only uses the first part of operand \p Op.
-  bool onlyFirstPartUsed(const VPValue *Op) const override {
+  bool usesFirstPartOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -3590,7 +3590,7 @@ public:
   }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -3700,7 +3700,7 @@ public:
   VPValue *getStepValue() const { return getOperand(2); }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
@@ -3765,7 +3765,7 @@ public:
   VPValue *getStepValue() const { return getOperand(1); }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
-  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+  bool usesFirstLaneOnly(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f792d0a..80cd112 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1276,7 +1276,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   }
 }
 
-bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
     return vputils::onlyFirstLaneUsed(this);
@@ -1325,7 +1325,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   llvm_unreachable("switch should return");
 }
 
-bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
+bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   if (Instruction::isBinaryOp(getOpcode()))
     return vputils::onlyFirstPartUsed(this);
@@ -1692,7 +1692,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
     if (!VFTy->getParamType(I.index())->isVectorTy())
       Arg = State.get(I.value(), VPLane(0));
     else
-      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+      Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
     Args.push_back(Arg);
   }
 
@@ -1761,7 +1761,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
                                            State.TTI))
       Arg = State.get(I.value(), VPLane(0));
     else
-      Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
+      Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
     if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
                                                State.TTI))
       TysForDecl.push_back(Arg->getType());
@@ -1843,7 +1843,7 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }
 
-bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   return all_of(enumerate(operands()), [this, &Op](const auto &X) {
     auto [Idx, V] = X;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8ad772f..48bd697 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,14 +91,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
         if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
           NewRecipe = new VPWidenLoadRecipe(
               *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
-              false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
-              VPIRMetadata(*Load), Ingredient.getDebugLoc());
+              false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
+              Ingredient.getDebugLoc());
         } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
           NewRecipe = new VPWidenStoreRecipe(
               *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
               nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
-              Store->getAlign(), VPIRMetadata(*Store),
-              Ingredient.getDebugLoc());
+              VPIRMetadata(*Store), Ingredient.getDebugLoc());
         } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
           NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
         } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -205,7 +204,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
           return cast<VPRecipeBase>(U)->getParent() != SinkTo;
         });
     if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
-          return !U->onlyFirstLaneUsed(SinkCandidate);
+          return !U->usesFirstLaneOnly(SinkCandidate);
         }))
       continue;
     bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
@@ -4208,7 +4207,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
     auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
     auto *L = new VPWidenLoadRecipe(
         *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-        /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
+        /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
     L->insertBefore(LoadGroup);
     NarrowedOps.insert(L);
     return L;
@@ -4345,7 +4344,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
         cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
     auto *S = new VPWidenStoreRecipe(
         *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
-        /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
+        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
     S->insertBefore(StoreGroup);
     StoreGroup->eraseFromParent();
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index d6a0028..d4b8b72b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -582,7 +582,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
       /// Users that only demand the first lane can use the definition for lane
       /// 0.
       DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
-        return U.onlyFirstLaneUsed(DefR);
+        return U.usesFirstLaneOnly(DefR);
       });
 
       // Update each build vector user that currently has DefR as its only
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c6380d3..e22c5df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -18,12 +18,12 @@ using namespace llvm::VPlanPatternMatch;
 
 bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
   return all_of(Def->users(),
-                [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); });
+                [Def](const VPUser *U) { return U->usesFirstLaneOnly(Def); });
 }
 
 bool vputils::onlyFirstPartUsed(const VPValue *Def) {
   return all_of(Def->users(),
-                [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); });
+                [Def](const VPUser *U) { return U->usesFirstPartOnly(Def); });
 }
 
 bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 83e3fca..5da7463 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -274,12 +274,12 @@ public:
   virtual bool usesScalars(const VPValue *Op) const {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    return onlyFirstLaneUsed(Op);
+    return usesFirstLaneOnly(Op);
   }
 
   /// Returns true if the VPUser only uses the first lane of operand \p Op.
   /// Conservatively returns false.
-  virtual bool onlyFirstLaneUsed(const VPValue *Op) const {
+  virtual bool usesFirstLaneOnly(const VPValue *Op) const {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return false;
@@ -287,7 +287,7 @@ public:
 
   /// Returns true if the VPUser only uses the first part of operand \p Op.
   /// Conservatively returns false.
-  virtual bool onlyFirstPartUsed(const VPValue *Op) const {
+  virtual bool usesFirstPartOnly(const VPValue *Op) const {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
     return false;
diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
new file mode 100644
index 0000000..e784ead
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -0,0 +1,315 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i8> %a, %b
+  %t1 = bitcast <8 x i1> %mask to i8
+  %t2 = call i8 @llvm.ctpop(i8 %t1)
+  %t3 = zext i8 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <16 x i8> %a, %b
+  %t1 = bitcast <16 x i1> %mask to i16
+  %t2 = call i16 @llvm.ctpop(i16 %t1)
+  %t3 = zext i16 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i16> %a, %b
+  %t1 = bitcast <4 x i1> %mask to i4
+  %t2 = call i4 @llvm.ctpop(i4 %t1)
+  %t3 = zext i4 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i16> %a, %b
+  %t1 = bitcast <8 x i1> %mask to i8
+  %t2 = call i8 @llvm.ctpop(i8 %t1)
+  %t3 = zext i8 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <2 x i32> %a, %b
+  %t1 = bitcast <2 x i1> %mask to i2
+  %t2 = call i2 @llvm.ctpop(i2 %t1)
+  %t3 = zext i2 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i32> %a, %b
+  %t1 = bitcast <4 x i1> %mask to i4
+  %t2 = call i4 @llvm.ctpop(i4 %t1)
+  %t3 = zext i4 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %mask = icmp slt <1 x i64> %a, %b
+  %t1 = bitcast <1 x i1> %mask to i1
+  %t2 = call i1 @llvm.ctpop(i1 %t1)
+  %t3 = zext i1 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <2 x i64> %a, %b
+  %t1 = bitcast <2 x i1> %mask to i2
+  %t2 = call i2 @llvm.ctpop(i2 %t1)
+  %t3 = zext i2 %t2 to i32
+  ret i32 %t3
+}
+
+define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i8> %a, %b
+  %t1 = bitcast <8 x i1> %mask to i8
+  %t2 = call i8 @llvm.ctpop(i8 %t1)
+  %t3 = zext i8 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <16 x i8> %a, %b
+  %t1 = bitcast <16 x i1> %mask to i16
+  %t2 = call i16 @llvm.ctpop(i16 %t1)
+  %t3 = zext i16 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i16> %a, %b
+  %t1 = bitcast <4 x i1> %mask to i4
+  %t2 = call i4 @llvm.ctpop(i4 %t1)
+  %t3 = zext i4 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <8 x i16> %a, %b
+  %t1 = bitcast <8 x i1> %mask to i8
+  %t2 = call i8 @llvm.ctpop(i8 %t1)
+  %t3 = zext i8 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v2i32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <2 x i32> %a, %b
+  %t1 = bitcast <2 x i1> %mask to i2
+  %t2 = call i2 @llvm.ctpop(i2 %t1)
+  %t3 = zext i2 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <4 x i32> %a, %b
+  %t1 = bitcast <4 x i1> %mask to i4
+  %t2 = call i4 @llvm.ctpop(i4 %t1)
+  %t3 = zext i4 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %mask = icmp slt <1 x i64> %a, %b
+  %t1 = bitcast <1 x i1> %mask to i1
+  %t2 = call i1 @llvm.ctpop(i1 %t1)
+  %t3 = zext i1 %t2 to i64
+  ret i64 %t3
+}
+
+define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: vmask_popcount_i64_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    ret
+  %mask = icmp slt <2 x i64> %a, %b
+  %t1 = bitcast <2 x i1> %mask to i2
+  %t2 = call i2 @llvm.ctpop(i2 %t1)
+  %t3 = zext i2 %t2 to i64
+  ret i64 %t3
+}
+
+define i32 @non_vmask_popcount_1(half %a) {
+; CHECK-LABEL: non_vmask_popcount_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %t1 = bitcast half %a to i16
+  %t2 = call i16 @llvm.ctpop(i16 %t1)
+  %t3 = zext i16 %t2 to i32
+  ret i32 %t3
+}
+
+define i32 @non_vmask_popcount_2(<8 x i16> %a) {
+; CHECK-LABEL: non_vmask_popcount_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    umov w8, v0.b[0]
+; CHECK-NEXT:    umov w9, v0.b[1]
+; CHECK-NEXT:    umov w10, v0.b[2]
+; CHECK-NEXT:    and w8, w8, #0x3
+; CHECK-NEXT:    bfi w8, w9, #2, #2
+; CHECK-NEXT:    umov w9, v0.b[3]
+; CHECK-NEXT:    bfi w8, w10, #4, #2
+; CHECK-NEXT:    umov w10, v0.b[4]
+; CHECK-NEXT:    bfi w8, w9, #6, #2
+; CHECK-NEXT:    umov w9, v0.b[5]
+; CHECK-NEXT:    bfi w8, w10, #8, #2
+; CHECK-NEXT:    umov w10, v0.b[6]
+; CHECK-NEXT:    bfi w8, w9, #10, #2
+; CHECK-NEXT:    umov w9, v0.b[7]
+; CHECK-NEXT:    bfi w8, w10, #12, #2
+; CHECK-NEXT:    orr w8, w8, w9, lsl #14
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %mask = trunc <8 x i16> %a to <8 x i2>
+  %t1 = bitcast <8 x i2> %mask to i16
+  %t2 = call i16 @llvm.ctpop(i16 %t1)
+  %t3 = zext i16 %t2 to i32
+  ret i32 %t3
+}
diff --git a/llvm/test/CodeGen/AArch64/vector-minmax.ll b/llvm/test/CodeGen/AArch64/vector-minmax.ll
new file mode 100644
index 0000000..6696f94
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-minmax.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE
+
+define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){
+; CHECK-LABEL: smax_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmgt v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smax_v2i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ptrue p0.d, vl2
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
+entry:
+  %0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: smin_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmgt v2.2d, v1.2d, v0.2d
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smin_v2i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ptrue p0.d, vl2
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
+entry:
+  %0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){
+; CHECK-LABEL: umax_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmhi v2.2d, v0.2d, v1.2d
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umax_v2i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ptrue p0.d, vl2
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
+entry:
+  %0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: umin_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmhi v2.2d, v1.2d, v0.2d
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umin_v2i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ptrue p0.d, vl2
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-SVE-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-SVE-NEXT:    ret
+entry:
+  %0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %0
+}
+
+define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){
+; CHECK-LABEL: smax_v1i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmgt d2, d0, d1
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smax_v1i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ptrue p0.d, vl1
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-SVE-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SVE-NEXT:    ret
+entry:
+  %0 = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b)
+  ret <1 x i64> %0
+}
+
+; This is legal for Neon, so this should use the Neon smax.
+define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){
+; CHECK-LABEL: smax_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smax_v4i32:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    smax v0.4s, v0.4s, v1.4s
+; CHECK-SVE-NEXT:    ret
+entry:
+  %0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+}
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index b2635d3..3685e9c 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,111 +730,6 @@ entry:
   ret void
 }
 
-define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
-                                     <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
-; CHECK-LABEL: store_factor8:
-; CHECK:       .Lfunc_begin17:
-; CHECK-NEXT:    .cfi_startproc
-; CHECK-NEXT:  // %bb.0:
-; CHECK:  zip1	[[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
-; CHECK-NEXT:  zip2	[[V5:.*s]], [[I1]], [[I5]]
-; CHECK-NEXT:  zip1	[[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
-; CHECK-NEXT:  zip2 [[V6:.*s]], [[I2]], [[I6]]
-; CHECK-NEXT:  zip1	[[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
-; CHECK-NEXT:  zip2	[[V7:.*s]], [[I3]], [[I7]]
-; CHECK-NEXT:  zip1	[[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
-; CHECK-NEXT:  zip2	[[V8:.*s]], [[I4]], [[I8]]
-; CHECK-NEXT:  st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
-; CHECK-NEXT:  st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
-; CHECK-NEXT:  ret
-
-  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
-  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-
-  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
-  store <32 x i32> %interleaved.vec, ptr %ptr, align 4
-  ret void
-}
-
-define void @store_factor16(ptr %ptr, <4 x i32> %a0,  <4 x i32> %a1,  <4 x i32> %a2,  <4 x i32> %a3,
-                                      <4 x i32> %a4,  <4 x i32> %a5,  <4 x i32> %a6,  <4 x i32> %a7,
-                                      <4 x i32> %a8,  <4 x i32> %a9,  <4 x i32> %a10, <4 x i32> %a11,
-                                      <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
-; CHECK-LABEL: store_factor16:
-; CHECK:       .Lfunc_begin18:
-; CHECK-NEXT:    .cfi_startproc
-; CHECK-NEXT:  // %bb.0:
-; CHECK:      	zip1	[[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
-; CHECK-NEXT:  	zip1	[[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
-; CHECK-NEXT:  	zip1	[[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
-; CHECK-NEXT:  	zip1	[[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
-; CHECK-NEXT:  	zip1	[[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
-; CHECK-NEXT:  	zip2	[[V09:.*s]], [[I01]], [[I09]]
-; CHECK-NEXT:  	zip2	[[V13:.*s]], [[I05]], [[I13]]
-; CHECK-NEXT:  	zip1	[[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
-; CHECK-NEXT:  	zip1	[[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
-; CHECK-NEXT:  	zip1	[[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
-; CHECK-NEXT:  	zip2	[[V10:.*s]], [[I02]], [[I10]]
-; CHECK-NEXT:  	zip2	[[V14:.*s]], [[I06]], [[I14]]
-; CHECK-NEXT:  	zip2	[[V11:.*s]], [[I03]], [[I11]]
-; CHECK-NEXT:  	zip1	[[V17:.*s]], [[V01]], [[V05]]
-; CHECK-NEXT:  	zip2	[[V15:.*s]], [[I07]], [[I15]]
-; CHECK-NEXT:  	zip2	[[V21:.*s]], [[V01]], [[V05]]
-; CHECK-NEXT:  	zip1	[[V18:.*s]], [[V02]], [[V06]]
-; CHECK-NEXT:  	zip2	[[V12:.*s]], [[I04]], [[I12]]
-; CHECK-NEXT:  	zip2	[[V16:.*s]], [[I08]], [[I16]]
-; CHECK-NEXT:  	zip1	[[V19:.*s]], [[V03]], [[V07]]
-; CHECK-NEXT:  	zip2	[[V22:.*s]], [[V02]], [[V06]]
-; CHECK-NEXT:  	zip1	[[V25:.*s]], [[V09]], [[V13]]
-; CHECK-NEXT:  	zip1	[[V20:.*s]], [[V04]], [[V08]]
-; CHECK-NEXT:  	zip2	[[V23:.*s]], [[V03]], [[V07]]
-; CHECK-NEXT:  	zip1	[[V26:.*s]], [[V10]], [[V14]]
-; CHECK-NEXT:  	zip2	[[V29:.*s]], [[V09]], [[V13]]
-; CHECK-NEXT:  	zip2	[[V24:.*s]], [[V04]], [[V08]]
-; CHECK-NEXT:  	zip1	[[V27:.*s]], [[V11]], [[V15]]
-; CHECK-NEXT:  	zip2	[[V30:.*s]], [[V10]], [[V14]]
-; CHECK-NEXT:  	zip1	[[V28:.*s]], [[V12]], [[V16]]
-; CHECK-NEXT:  	zip2	[[V31:.*s]], [[V11]], [[V15]]
-; CHECK-NEXT:  	zip2	[[V32:.*s]], [[V12]], [[V16]]
-; CHECK-NEXT:  	st4	{ [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
-; CHECK-NEXT:  	ldp	d9, d8, [sp, #48]               // 16-byte Folded Reload
-; CHECK-NEXT:  	ldp	d11, d10, [sp, #32]             // 16-byte Folded Reload
-; CHECK-NEXT:  	st4	{ [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
-; CHECK-NEXT:  	add	x8, x0, #128
-; CHECK-NEXT:  	ldp	d13, d12, [sp, #16]             // 16-byte Folded Reload
-; CHECK-NEXT:  	st4	{ [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
-; CHECK-NEXT:  	add	x8, x0, #192
-; CHECK-NEXT:  	st4	{ [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
-; CHECK-NEXT:  	ldp	d15, d14, [sp], #64             // 16-byte Folded Reload
-; CHECK-NEXT:  	ret
-
-  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
-  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-
-  %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-
-  %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32>  <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-  store <64 x i32> %interleaved.vec, ptr %ptr, align 4
-  ret void
-}
-
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/ARM/ldexp-fp128.ll b/llvm/test/CodeGen/ARM/ldexp-fp128.ll
new file mode 100644
index 0000000..93fcd39e8
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ldexp-fp128.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck -check-prefix=LINUX %s
+
+define fp128 @testExpl(fp128 %val, i32 %a) {
+; LINUX-LABEL: testExpl:
+; LINUX:       @ %bb.0:
+; LINUX-NEXT:    push {r11, lr}
+; LINUX-NEXT:    sub sp, sp, #8
+; LINUX-NEXT:    ldr r12, [sp, #16]
+; LINUX-NEXT:    str r12, [sp]
+; LINUX-NEXT:    bl ldexpl
+; LINUX-NEXT:    add sp, sp, #8
+; LINUX-NEXT:    pop {r11, pc}
+  %call = tail call fp128 @ldexpl(fp128 %val, i32 %a)
+  ret fp128 %call
+}
+
+declare fp128 @ldexpl(fp128, i32) memory(none)
+
+define fp128 @test_ldexp_f128_i32(fp128 %val, i32 %a) {
+; LINUX-LABEL: test_ldexp_f128_i32:
+; LINUX:       @ %bb.0:
+; LINUX-NEXT:    push {r11, lr}
+; LINUX-NEXT:    sub sp, sp, #8
+; LINUX-NEXT:    ldr r12, [sp, #16]
+; LINUX-NEXT:    str r12, [sp]
+; LINUX-NEXT:    bl ldexpl
+; LINUX-NEXT:    add sp, sp, #8
+; LINUX-NEXT:    pop {r11, pc}
+  %call = tail call fp128 @llvm.ldexp.f128.i32(fp128 %val, i32 %a)
+  ret fp128 %call
+}
+
+define <2 x fp128> @test_ldexp_v2f128_v2i32(<2 x fp128> %val, <2 x i32> %a) {
+; LINUX-LABEL: test_ldexp_v2f128_v2i32:
+; LINUX:       @ %bb.0:
+; LINUX-NEXT:    push {r4, r5, r6, lr}
+; LINUX-NEXT:    vpush {d8}
+; LINUX-NEXT:    sub sp, sp, #8
+; LINUX-NEXT:    mov r5, r3
+; LINUX-NEXT:    add r3, sp, #40
+; LINUX-NEXT:    mov r6, r2
+; LINUX-NEXT:    mov r4, r0
+; LINUX-NEXT:    ldm r3, {r0, r1, r2, r3}
+; LINUX-NEXT:    vldr d8, [sp, #56]
+; LINUX-NEXT:    vst1.32 {d8[1]}, [sp:32]
+; LINUX-NEXT:    bl ldexpl
+; LINUX-NEXT:    ldr r12, [sp, #32]
+; LINUX-NEXT:    vst1.32 {d8[0]}, [sp:32]
+; LINUX-NEXT:    ldr lr, [sp, #36]
+; LINUX-NEXT:    str r0, [r4, #16]
+; LINUX-NEXT:    mov r0, r6
+; LINUX-NEXT:    str r1, [r4, #20]
+; LINUX-NEXT:    mov r1, r5
+; LINUX-NEXT:    str r2, [r4, #24]
+; LINUX-NEXT:    mov r2, r12
+; LINUX-NEXT:    str r3, [r4, #28]
+; LINUX-NEXT:    mov r3, lr
+; LINUX-NEXT:    bl ldexpl
+; LINUX-NEXT:    stm r4, {r0, r1, r2, r3}
+; LINUX-NEXT:    add sp, sp, #8
+; LINUX-NEXT:    vpop {d8}
+; LINUX-NEXT:    pop {r4, r5, r6, pc}
+  %call = tail call <2 x fp128> @llvm.ldexp.v2f128.v2i32(<2 x fp128> %val, <2 x i32> %a)
+  ret <2 x fp128> %call
+}
diff --git a/llvm/test/CodeGen/LoongArch/ldptr.ll b/llvm/test/CodeGen/LoongArch/ldptr.ll
index c3656a6..9bafa10 100644
--- a/llvm/test/CodeGen/LoongArch/ldptr.ll
+++ b/llvm/test/CodeGen/LoongArch/ldptr.ll
@@ -24,8 +24,7 @@ define signext i32 @ldptr_w(ptr %p) nounwind {
 ; LA32-LABEL: ldptr_w:
 ; LA32:       # %bb.0: # %entry
 ; LA32-NEXT:    addi.w $a0, $a0, 2047
-; LA32-NEXT:    addi.w $a0, $a0, 1
-; LA32-NEXT:    ld.w $a0, $a0, 0
+; LA32-NEXT:    ld.w $a0, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: ldptr_w:
@@ -81,10 +80,9 @@ entry:
 define i64 @ldptr_d(ptr %p) nounwind {
 ; LA32-LABEL: ldptr_d:
 ; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    addi.w $a0, $a0, 2047
-; LA32-NEXT:    addi.w $a1, $a0, 1
-; LA32-NEXT:    ld.w $a0, $a1, 0
-; LA32-NEXT:    ld.w $a1, $a1, 4
+; LA32-NEXT:    addi.w $a1, $a0, 2047
+; LA32-NEXT:    ld.w $a0, $a1, 1
+; LA32-NEXT:    ld.w $a1, $a1, 5
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: ldptr_d:
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
index 9a806a1..93f73e5 100644
--- a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -25,14 +25,13 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s1, $a2
 ; LA32-NEXT:    slli.w $a1, $a0, 4
 ; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT:    add.w $a0, $a4, $a0
 ; LA32-NEXT:    sltui $a1, $a3, 1
 ; LA32-NEXT:    slti $a2, $a3, 0
 ; LA32-NEXT:    masknez $a2, $a2, $a1
 ; LA32-NEXT:    sltui $a3, $s1, 1
 ; LA32-NEXT:    maskeqz $a1, $a3, $a1
 ; LA32-NEXT:    or $a1, $a1, $a2
-; LA32-NEXT:    addi.w $s2, $a0, 8
+; LA32-NEXT:    add.w $s2, $a4, $a0
 ; LA32-NEXT:    bnez $a1, .LBB0_3
 ; LA32-NEXT:  # %bb.1: # %for.body.preheader
 ; LA32-NEXT:    move $fp, $a4
@@ -45,8 +44,8 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl f
-; LA32-NEXT:    ld.w $a0, $s2, 4
-; LA32-NEXT:    ld.w $a1, $s2, 0
+; LA32-NEXT:    ld.w $a0, $s2, 12
+; LA32-NEXT:    ld.w $a1, $s2, 8
 ; LA32-NEXT:    add.w $a0, $a0, $s6
 ; LA32-NEXT:    add.w $s3, $a1, $s3
 ; LA32-NEXT:    sltu $a1, $s3, $a1
@@ -63,8 +62,8 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s3, $zero
 ; LA32-NEXT:    move $s6, $zero
 ; LA32-NEXT:  .LBB0_4: # %for.cond.cleanup
-; LA32-NEXT:    st.w $s3, $s2, 0
-; LA32-NEXT:    st.w $s6, $s2, 4
+; LA32-NEXT:    st.w $s3, $s2, 8
+; LA32-NEXT:    st.w $s6, $s2, 12
 ; LA32-NEXT:    ld.w $s6, $sp, 12 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s5, $sp, 16 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
@@ -88,8 +87,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $s0, $a1
 ; LA64-NEXT:    slli.d $a1, $a0, 4
 ; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT:    add.d $a0, $a2, $a0
-; LA64-NEXT:    addi.d $s1, $a0, 8
+; LA64-NEXT:    add.d $s1, $a2, $a0
 ; LA64-NEXT:    blez $s0, .LBB0_3
 ; LA64-NEXT:  # %bb.1: # %for.body.preheader
 ; LA64-NEXT:    move $fp, $a2
@@ -100,7 +98,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $a0, $fp
 ; LA64-NEXT:    pcaddu18i $ra, %call36(f)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    ld.d $a0, $s1, 0
+; LA64-NEXT:    ld.d $a0, $s1, 8
 ; LA64-NEXT:    addi.d $s0, $s0, -1
 ; LA64-NEXT:    add.d $s2, $a0, $s2
 ; LA64-NEXT:    bnez $s0, .LBB0_2
@@ -108,7 +106,7 @@ define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:  .LBB0_3:
 ; LA64-NEXT:    move $s2, $zero
 ; LA64-NEXT:  .LBB0_4: # %for.cond.cleanup
-; LA64-NEXT:    st.d $s2, $s1, 0
+; LA64-NEXT:    st.d $s2, $s1, 8
 ; LA64-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -153,14 +151,13 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s1, $a2
 ; LA32-NEXT:    slli.w $a1, $a0, 4
 ; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT:    add.w $a0, $a4, $a0
 ; LA32-NEXT:    sltui $a1, $a3, 1
 ; LA32-NEXT:    slti $a2, $a3, 0
 ; LA32-NEXT:    masknez $a2, $a2, $a1
 ; LA32-NEXT:    sltui $a3, $s1, 1
 ; LA32-NEXT:    maskeqz $a1, $a3, $a1
 ; LA32-NEXT:    or $a1, $a1, $a2
-; LA32-NEXT:    addi.w $s2, $a0, 16
+; LA32-NEXT:    add.w $s2, $a4, $a0
 ; LA32-NEXT:    bnez $a1, .LBB1_3
 ; LA32-NEXT:  # %bb.1: # %for.body.preheader
 ; LA32-NEXT:    move $fp, $a4
@@ -172,7 +169,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl f
-; LA32-NEXT:    fld.s $fa0, $s2, 0
+; LA32-NEXT:    fld.s $fa0, $s2, 16
 ; LA32-NEXT:    addi.w $s3, $s3, 1
 ; LA32-NEXT:    sltui $a0, $s3, 1
 ; LA32-NEXT:    add.w $s4, $s4, $a0
@@ -185,7 +182,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:  .LBB1_3:
 ; LA32-NEXT:    movgr2fr.w $fs0, $zero
 ; LA32-NEXT:  .LBB1_4: # %for.cond.cleanup
-; LA32-NEXT:    fst.s $fs0, $s2, 0
+; LA32-NEXT:    fst.s $fs0, $s2, 16
 ; LA32-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
 ; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
@@ -208,8 +205,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $s0, $a1
 ; LA64-NEXT:    slli.d $a1, $a0, 4
 ; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT:    add.d $a0, $a2, $a0
-; LA64-NEXT:    addi.d $s1, $a0, 16
+; LA64-NEXT:    add.d $s1, $a2, $a0
 ; LA64-NEXT:    blez $s0, .LBB1_3
 ; LA64-NEXT:  # %bb.1: # %for.body.preheader
 ; LA64-NEXT:    move $fp, $a2
@@ -220,7 +216,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $a0, $fp
 ; LA64-NEXT:    pcaddu18i $ra, %call36(f)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fld.s $fa0, $s1, 0
+; LA64-NEXT:    fld.s $fa0, $s1, 16
 ; LA64-NEXT:    addi.d $s0, $s0, -1
 ; LA64-NEXT:    fadd.s $fs0, $fa0, $fs0
 ; LA64-NEXT:    bnez $s0, .LBB1_2
@@ -228,7 +224,7 @@ define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:  .LBB1_3:
 ; LA64-NEXT:    movgr2fr.w $fs0, $zero
 ; LA64-NEXT:  .LBB1_4: # %for.cond.cleanup
-; LA64-NEXT:    fst.s $fs0, $s1, 0
+; LA64-NEXT:    fst.s $fs0, $s1, 16
 ; LA64-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
@@ -271,14 +267,13 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s0, $a3
 ; LA32-NEXT:    move $s1, $a2
 ; LA32-NEXT:    slli.w $a0, $a0, 6
-; LA32-NEXT:    add.w $a0, $a4, $a0
 ; LA32-NEXT:    sltui $a1, $a3, 1
 ; LA32-NEXT:    slti $a2, $a3, 0
 ; LA32-NEXT:    masknez $a2, $a2, $a1
 ; LA32-NEXT:    sltui $a3, $s1, 1
 ; LA32-NEXT:    maskeqz $a1, $a3, $a1
 ; LA32-NEXT:    or $a1, $a1, $a2
-; LA32-NEXT:    addi.w $s2, $a0, 16
+; LA32-NEXT:    add.w $s2, $a4, $a0
 ; LA32-NEXT:    bnez $a1, .LBB2_3
 ; LA32-NEXT:  # %bb.1: # %for.body.preheader
 ; LA32-NEXT:    move $fp, $a4
@@ -291,7 +286,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl f
-; LA32-NEXT:    vld $vr0, $s2, 0
+; LA32-NEXT:    vld $vr0, $s2, 16
 ; LA32-NEXT:    addi.w $s3, $s3, 1
 ; LA32-NEXT:    sltui $a0, $s3, 1
 ; LA32-NEXT:    add.w $s4, $s4, $a0
@@ -307,7 +302,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:  .LBB2_3:
 ; LA32-NEXT:    vrepli.b $vr0, 0
 ; LA32-NEXT:  .LBB2_4: # %for.cond.cleanup
-; LA32-NEXT:    vst $vr0, $s2, 0
+; LA32-NEXT:    vst $vr0, $s2, 16
 ; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
@@ -326,8 +321,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
 ; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
 ; LA64-NEXT:    slli.d $a0, $a0, 6
-; LA64-NEXT:    add.d $a0, $a2, $a0
-; LA64-NEXT:    addi.d $s1, $a0, 16
+; LA64-NEXT:    add.d $s1, $a2, $a0
 ; LA64-NEXT:    blez $a1, .LBB2_3
 ; LA64-NEXT:  # %bb.1: # %for.body.preheader
 ; LA64-NEXT:    move $fp, $a2
@@ -340,7 +334,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $a0, $fp
 ; LA64-NEXT:    pcaddu18i $ra, %call36(f)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    vld $vr0, $s1, 0
+; LA64-NEXT:    vld $vr0, $s1, 16
 ; LA64-NEXT:    addi.d $s0, $s0, -1
 ; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
 ; LA64-NEXT:    vadd.w $vr1, $vr0, $vr1
@@ -351,7 +345,7 @@ define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:  .LBB2_3:
 ; LA64-NEXT:    vrepli.b $vr0, 0
 ; LA64-NEXT:  .LBB2_4: # %for.cond.cleanup
-; LA64-NEXT:    vst $vr0, $s1, 0
+; LA64-NEXT:    vst $vr0, $s1, 16
 ; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -393,14 +387,13 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s0, $a3
 ; LA32-NEXT:    move $s1, $a2
 ; LA32-NEXT:    slli.w $a0, $a0, 6
-; LA32-NEXT:    add.w $a0, $a4, $a0
 ; LA32-NEXT:    sltui $a1, $a3, 1
 ; LA32-NEXT:    slti $a2, $a3, 0
 ; LA32-NEXT:    masknez $a2, $a2, $a1
 ; LA32-NEXT:    sltui $a3, $s1, 1
 ; LA32-NEXT:    maskeqz $a1, $a3, $a1
 ; LA32-NEXT:    or $a1, $a1, $a2
-; LA32-NEXT:    addi.w $s2, $a0, 32
+; LA32-NEXT:    add.w $s2, $a4, $a0
 ; LA32-NEXT:    bnez $a1, .LBB3_3
 ; LA32-NEXT:  # %bb.1: # %for.body.preheader
 ; LA32-NEXT:    move $fp, $a4
@@ -413,7 +406,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl f
-; LA32-NEXT:    xvld $xr0, $s2, 0
+; LA32-NEXT:    xvld $xr0, $s2, 32
 ; LA32-NEXT:    addi.w $s3, $s3, 1
 ; LA32-NEXT:    sltui $a0, $s3, 1
 ; LA32-NEXT:    add.w $s4, $s4, $a0
@@ -429,7 +422,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:  .LBB3_3:
 ; LA32-NEXT:    xvrepli.b $xr0, 0
 ; LA32-NEXT:  .LBB3_4: # %for.cond.cleanup
-; LA32-NEXT:    xvst $xr0, $s2, 0
+; LA32-NEXT:    xvst $xr0, $s2, 32
 ; LA32-NEXT:    ld.w $s4, $sp, 52 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s3, $sp, 56 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s2, $sp, 60 # 4-byte Folded Reload
@@ -448,8 +441,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
 ; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
 ; LA64-NEXT:    slli.d $a0, $a0, 6
-; LA64-NEXT:    add.d $a0, $a2, $a0
-; LA64-NEXT:    addi.d $s1, $a0, 32
+; LA64-NEXT:    add.d $s1, $a2, $a0
 ; LA64-NEXT:    blez $a1, .LBB3_3
 ; LA64-NEXT:  # %bb.1: # %for.body.preheader
 ; LA64-NEXT:    move $fp, $a2
@@ -462,7 +454,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $a0, $fp
 ; LA64-NEXT:    pcaddu18i $ra, %call36(f)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    xvld $xr0, $s1, 0
+; LA64-NEXT:    xvld $xr0, $s1, 32
 ; LA64-NEXT:    addi.d $s0, $s0, -1
 ; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
 ; LA64-NEXT:    xvadd.h $xr1, $xr0, $xr1
@@ -473,7 +465,7 @@ define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:  .LBB3_3:
 ; LA64-NEXT:    xvrepli.b $xr0, 0
 ; LA64-NEXT:  .LBB3_4: # %for.cond.cleanup
-; LA64-NEXT:    xvst $xr0, $s1, 0
+; LA64-NEXT:    xvst $xr0, $s1, 32
 ; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
@@ -516,14 +508,13 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s1, $a2
 ; LA32-NEXT:    slli.w $a1, $a0, 4
 ; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT:    add.w $a0, $a4, $a0
 ; LA32-NEXT:    sltui $a1, $a3, 1
 ; LA32-NEXT:    slti $a2, $a3, 0
 ; LA32-NEXT:    masknez $a2, $a2, $a1
 ; LA32-NEXT:    sltui $a3, $s1, 1
 ; LA32-NEXT:    maskeqz $a1, $a3, $a1
 ; LA32-NEXT:    or $a1, $a1, $a2
-; LA32-NEXT:    addi.w $s2, $a0, 16
+; LA32-NEXT:    add.w $s2, $a4, $a0
 ; LA32-NEXT:    bnez $a1, .LBB4_3
 ; LA32-NEXT:  # %bb.1: # %for.body.preheader
 ; LA32-NEXT:    move $fp, $a4
@@ -536,7 +527,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl f
-; LA32-NEXT:    vldrepl.b $vr0, $s2, 0
+; LA32-NEXT:    vldrepl.b $vr0, $s2, 16
 ; LA32-NEXT:    addi.w $s3, $s3, 1
 ; LA32-NEXT:    sltui $a0, $s3, 1
 ; LA32-NEXT:    add.w $s4, $s4, $a0
@@ -552,7 +543,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:  .LBB4_3:
 ; LA32-NEXT:    vrepli.b $vr0, 0
 ; LA32-NEXT:  .LBB4_4: # %for.cond.cleanup
-; LA32-NEXT:    vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT:    vstelm.b $vr0, $s2, 16, 1
 ; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
@@ -573,8 +564,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $s0, $a1
 ; LA64-NEXT:    slli.d $a1, $a0, 4
 ; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT:    add.d $a0, $a2, $a0
-; LA64-NEXT:    addi.d $s1, $a0, 16
+; LA64-NEXT:    add.d $s1, $a2, $a0
 ; LA64-NEXT:    blez $s0, .LBB4_3
 ; LA64-NEXT:  # %bb.1: # %for.body.preheader
 ; LA64-NEXT:    move $fp, $a2
@@ -586,7 +576,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $a0, $fp
 ; LA64-NEXT:    pcaddu18i $ra, %call36(f)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    vldrepl.b $vr0, $s1, 0
+; LA64-NEXT:    vldrepl.b $vr0, $s1, 16
 ; LA64-NEXT:    addi.d $s0, $s0, -1
 ; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
 ; LA64-NEXT:    vadd.b $vr1, $vr0, $vr1
@@ -597,7 +587,7 @@ define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:  .LBB4_3:
 ; LA64-NEXT:    vrepli.b $vr0, 0
 ; LA64-NEXT:  .LBB4_4: # %for.cond.cleanup
-; LA64-NEXT:    vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT:    vstelm.b $vr0, $s1, 16, 1
 ; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
@@ -643,14 +633,13 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    move $s1, $a2
 ; LA32-NEXT:    slli.w $a1, $a0, 4
 ; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
-; LA32-NEXT:    add.w $a0, $a4, $a0
 ; LA32-NEXT:    sltui $a1, $a3, 1
 ; LA32-NEXT:    slti $a2, $a3, 0
 ; LA32-NEXT:    masknez $a2, $a2, $a1
 ; LA32-NEXT:    sltui $a3, $s1, 1
 ; LA32-NEXT:    maskeqz $a1, $a3, $a1
 ; LA32-NEXT:    or $a1, $a1, $a2
-; LA32-NEXT:    addi.w $s2, $a0, 8
+; LA32-NEXT:    add.w $s2, $a4, $a0
 ; LA32-NEXT:    bnez $a1, .LBB5_3
 ; LA32-NEXT:  # %bb.1: # %for.body.preheader
 ; LA32-NEXT:    move $fp, $a4
@@ -663,7 +652,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; LA32-NEXT:    move $a0, $fp
 ; LA32-NEXT:    bl f
-; LA32-NEXT:    xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT:    xvldrepl.d $xr0, $s2, 8
 ; LA32-NEXT:    addi.w $s3, $s3, 1
 ; LA32-NEXT:    sltui $a0, $s3, 1
 ; LA32-NEXT:    add.w $s4, $s4, $a0
@@ -679,7 +668,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA32-NEXT:  .LBB5_3:
 ; LA32-NEXT:    xvrepli.b $xr0, 0
 ; LA32-NEXT:  .LBB5_4: # %for.cond.cleanup
-; LA32-NEXT:    xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT:    xvstelm.d $xr0, $s2, 8, 1
 ; LA32-NEXT:    ld.w $s4, $sp, 52 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s3, $sp, 56 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s2, $sp, 60 # 4-byte Folded Reload
@@ -700,8 +689,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $s0, $a1
 ; LA64-NEXT:    slli.d $a1, $a0, 4
 ; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
-; LA64-NEXT:    add.d $a0, $a2, $a0
-; LA64-NEXT:    addi.d $s1, $a0, 8
+; LA64-NEXT:    add.d $s1, $a2, $a0
 ; LA64-NEXT:    blez $s0, .LBB5_3
 ; LA64-NEXT:  # %bb.1: # %for.body.preheader
 ; LA64-NEXT:    move $fp, $a2
@@ -713,7 +701,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:    move $a0, $fp
 ; LA64-NEXT:    pcaddu18i $ra, %call36(f)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT:    xvldrepl.d $xr0, $s1, 8
 ; LA64-NEXT:    addi.d $s0, $s0, -1
 ; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
 ; LA64-NEXT:    xvfadd.d $xr1, $xr0, $xr1
@@ -724,7 +712,7 @@ define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
 ; LA64-NEXT:  .LBB5_3:
 ; LA64-NEXT:    xvrepli.b $xr0, 0
 ; LA64-NEXT:  .LBB5_4: # %for.cond.cleanup
-; LA64-NEXT:    xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT:    xvstelm.d $xr0, $s1, 8, 1
 ; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/LoongArch/stptr.ll b/llvm/test/CodeGen/LoongArch/stptr.ll
index d70f9f4..23b433a 100644
--- a/llvm/test/CodeGen/LoongArch/stptr.ll
+++ b/llvm/test/CodeGen/LoongArch/stptr.ll
@@ -23,8 +23,7 @@ define void @stptr_w(ptr %p, i32 signext %val) nounwind {
 ; LA32-LABEL: stptr_w:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $a0, $a0, 2047
-; LA32-NEXT:    addi.w $a0, $a0, 1
-; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    st.w $a1, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: stptr_w:
@@ -77,9 +76,8 @@ define void @stptr_d(ptr %p, i64 %val) nounwind {
 ; LA32-LABEL: stptr_d:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    addi.w $a0, $a0, 2047
-; LA32-NEXT:    addi.w $a0, $a0, 1
-; LA32-NEXT:    st.w $a2, $a0, 4
-; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    st.w $a2, $a0, 5
+; LA32-NEXT:    st.w $a1, $a0, 1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: stptr_d:
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 50bd22b..f4964288 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -205,12 +205,19 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 22
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    ret
+; RV64I-LABEL: addmul22:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 22
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64XTHEADBA-LABEL: addmul22:
+; RV64XTHEADBA:       # %bb.0:
+; RV64XTHEADBA-NEXT:    th.addsl a2, a0, a0, 2
+; RV64XTHEADBA-NEXT:    th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT:    th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT:    ret
   %c = mul i64 %a, 22
   %d = add i64 %c, %b
   ret i64 %d
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7fd7626..d4b2288 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -585,6 +585,33 @@ define i64 @addmul12(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul14(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul14:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a2, a0, 1
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul14:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh1add a2, a0, a0
+; RV64ZBA-NEXT:    sh1add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul14:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 14
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul18(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul18:
 ; RV64I:       # %bb.0:
@@ -636,12 +663,26 @@ define i64 @addmul20(i64 %a, i64 %b) {
 }
 
 define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 22
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    ret
+; RV64I-LABEL: addmul22:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 22
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul22:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh2add a2, a0, a0
+; RV64ZBA-NEXT:    sh1add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul22:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
   %c = mul i64 %a, 22
   %d = add i64 %c, %b
   ret i64 %d
@@ -672,6 +713,32 @@ define i64 @addmul24(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul26(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul26:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 26
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul26:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh1add a2, a0, a0
+; RV64ZBA-NEXT:    sh2add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul26:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 26
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul36(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul36:
 ; RV64I:       # %bb.0:
@@ -722,6 +789,58 @@ define i64 @addmul40(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul38(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul38:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 38
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul38:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh3add a2, a0, a0
+; RV64ZBA-NEXT:    sh1add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul38:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 38
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul42(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul42:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 42
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul42:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh2add a2, a0, a0
+; RV64ZBA-NEXT:    sh2add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul42:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 42
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @addmul72(i64 %a, i64 %b) {
 ; RV64I-LABEL: addmul72:
 ; RV64I:       # %bb.0:
@@ -747,6 +866,84 @@ define i64 @addmul72(i64 %a, i64 %b) {
   ret i64 %d
 }
 
+define i64 @addmul74(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul74:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 74
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul74:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh3add a2, a0, a0
+; RV64ZBA-NEXT:    sh2add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul74:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 74
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul82(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul82:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 82
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul82:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh2add a2, a0, a0
+; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul82:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 82
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
+define i64 @addmul146(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul146:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a2, 146
+; RV64I-NEXT:    mul a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBA-LABEL: addmul146:
+; RV64ZBA:       # %bb.0:
+; RV64ZBA-NEXT:    sh3add a2, a0, a0
+; RV64ZBA-NEXT:    sh3add a0, a2, a0
+; RV64ZBA-NEXT:    sh1add a0, a0, a1
+; RV64ZBA-NEXT:    ret
+;
+; RV64XANDESPERF-LABEL: addmul146:
+; RV64XANDESPERF:       # %bb.0:
+; RV64XANDESPERF-NEXT:    nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT:    nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT:    nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT:    ret
+  %c = mul i64 %a, 146
+  %d = add i64 %c, %b
+  ret i64 %d
+}
+
 define i64 @mul50(i64 %a) {
 ; RV64I-LABEL: mul50:
 ; RV64I:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 20034b6..b6e29cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -863,3 +863,19 @@ entry:
     i64 2)
   ret <vscale x 1 x double> %2
 }
+
+; The two vsetvlis will be coalesced so the add will be made dead and
+; removed. Make sure we shrink the live interval of %x.
+define void @non_li_addi(i64 %x, ptr %p) {
+; CHECK-LABEL: non_li_addi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    ret
+entry:
+  %add = add i64 %x, 1
+  %0 = tail call i64 @llvm.riscv.vsetvli(i64 %add, i64 3, i64 0)
+  %1 = call <vscale x 8 x i8> @llvm.riscv.vle(<vscale x 8 x i8> poison, ptr %p, i64 %0)
+  %2 = tail call i64 @llvm.riscv.vsetvli(i64 1, i64 3, i64 0)
+  %3 = tail call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff(<vscale x 8 x i8> poison, ptr %p, i64 %2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index fdd30c9..f9929c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -104,6 +104,10 @@
     ret void
   }
 
+  define void @non_li_addi() {
+    ret void
+  }
+
   declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
 
   declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -664,3 +668,23 @@ body:             |
   bb.2:
     $x10 = COPY %vl
     PseudoRET implicit killed $x10
+...
+---
+# The two vsetvlis will be coalesced so the ADDI will be made dead and removed.
+# Make sure we shrink the live interval of %0.
+name: non_li_addi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x10
+    ; CHECK-LABEL: name: non_li_addi
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:gpr = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoVSETIVLI:%[0-9]+]]:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoRET
+    %0:gpr = COPY $x10
+    %1:gprnox0 = ADDI %0, 1
+    %2:gprnox0 = PseudoVSETVLI %1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    %3:gprnox0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    PseudoRET
diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
index d8e2b2c..305ab93 100644
--- a/llvm/test/CodeGen/RISCV/zicond-opts.ll
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -263,3 +263,35 @@ define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) {
   %7 = and i64 %6, %f
   ret i64 %7
 }
+
+define i32 @pr166596(i32 %conv.i, i1 %iszero) #0 {
+; RV32ZICOND-LABEL: pr166596:
+; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND-NEXT:    andi a1, a1, 1
+; RV32ZICOND-NEXT:    xori a0, a0, 1
+; RV32ZICOND-NEXT:    zext.h a0, a0
+; RV32ZICOND-NEXT:    clz a0, a0
+; RV32ZICOND-NEXT:    addi a0, a0, 41
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV32ZICOND-NEXT:    addi a0, a0, -9
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: pr166596:
+; RV64ZICOND:       # %bb.0: # %entry
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    xori a0, a0, 1
+; RV64ZICOND-NEXT:    zext.h a0, a0
+; RV64ZICOND-NEXT:    clz a0, a0
+; RV64ZICOND-NEXT:    addi a0, a0, 9
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    addi a0, a0, -9
+; RV64ZICOND-NEXT:    ret
+entry:
+  %not.i = xor i32 %conv.i, 1
+  %conv2.i = trunc i32 %not.i to i16
+  %conv22 = zext i16 %conv2.i to i64
+  %0 = call i64 @llvm.ctlz.i64(i64 %conv22, i1 false)
+  %cast = trunc i64 %0 to i32
+  %clzg = select i1 %iszero, i32 -9, i32 %cast
+  ret i32 %clzg
+}
diff --git a/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll
new file mode 100644
index 0000000..677291a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/allow_unknown_intrinsics.ll
@@ -0,0 +1,36 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics %s -o %t.spvt 2>&1 | FileCheck -check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=notllvm %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.some.custom %s -o %t.spvt 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm.,random.prefix %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spv-allow-unknown-intrinsics=llvm. %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %}
+
+; This test checks the command-line option that allows unknown intrinsics to
+; be represented as external function calls in SPIR-V.
+
+; CHECK-ERROR: LLVM ERROR: unable to legalize instruction: %3:iid(s64) = G_READCYCLECOUNTER (in function: foo)
+
+; CHECK: Name %[[READCYCLECOUNTER:[0-9]+]] "spirv.llvm_readcyclecounter"
+; CHECK: Name %[[SOME_CUSTOM_INTRINSIC:[0-9]+]] "spirv.llvm_some_custom_intrinsic"
+; CHECK-DAG: Decorate %[[READCYCLECOUNTER]] LinkageAttributes {{.*}} Import
+; CHECK: Decorate %[[SOME_CUSTOM_INTRINSIC]] LinkageAttributes {{.*}} Import
+; CHECK-DAG: %[[I64:[0-9]+]] = OpTypeInt 64
+; CHECK: %[[FnTy:[0-9]+]] = OpTypeFunction %[[I64]]
+; CHECK: %[[READCYCLECOUNTER]] = OpFunction %[[I64]] {{.*}} %[[FnTy]]
+; CHECK-DAG: %[[SOME_CUSTOM_INTRINSIC]] = OpFunction %[[I64]] {{.*}} %[[FnTy]]
+; CHECK-DAG: OpFunctionCall %[[I64]] %[[READCYCLECOUNTER]]
+; CHECK:     OpFunctionCall %[[I64]] %[[SOME_CUSTOM_INTRINSIC]]
+
+define spir_func void @foo() {
+entry:
+; TODO: if and when the SPIR-V backend learns how to lower readcyclecounter, we will have to pick another unhandled intrinsic.
+  %0 = call i64 @llvm.readcyclecounter()
+  %1 = call i64 @llvm.some.custom.intrinsic()
+  ret void
+}
+
+declare i64 @llvm.readcyclecounter()
+declare i64 @llvm.some.custom.intrinsic()
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll
new file mode 100644
index 0000000..f6b6115
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_blocking_pipes/PipeBlocking.ll
@@ -0,0 +1,98 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_blocking_pipes %s -o - -filetype=obj | spirv-val %}
+
+%opencl.pipe_ro_t = type opaque
+%opencl.pipe_wo_t = type opaque
+
+; CHECK-SPIRV: OpCapability BlockingPipesALTERA
+; CHECK-SPIRV: OpExtension "SPV_ALTERA_blocking_pipes"
+; CHECK-SPIRV: %[[PipeRTy:[0-9]+]] = OpTypePipe ReadOnly
+; CHECK-SPIRV: %[[PipeWTy:[0-9]+]] = OpTypePipe WriteOnly
+; CHECK-SPIRV: %[[PipeR1:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR1]] %[[#]] %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeR2:[0-9]+]] = OpLoad %[[PipeRTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpReadPipeBlockingALTERA %[[PipeR2]] %[[#]]  %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeW1:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW1]] %[[#]]  %[[#]] %[[#]]
+; CHECK-SPIRV: %[[PipeW2:[0-9]+]] = OpLoad %[[PipeWTy]] %[[#]] Aligned 8
+; CHECK-SPIRV: OpWritePipeBlockingALTERA %[[PipeW2]] %[[#]] %[[#]] %[[#]]
+
+define spir_func void @foo(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) {
+entry:
+  %p.addr = alloca target("spirv.Pipe", 0), align 8
+  %ptr.addr = alloca ptr addrspace(1), align 8
+  store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8
+  store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+  %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8
+  %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+  %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+  call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4)
+  ret void
+}
+
+declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePiii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+
+define spir_func void @bar(target("spirv.Pipe", 0) %p, ptr addrspace(1) %ptr) {
+entry:
+  %p.addr = alloca target("spirv.Pipe", 0), align 8
+  %ptr.addr = alloca ptr addrspace(1), align 8
+  store target("spirv.Pipe", 0) %p, target("spirv.Pipe", 0)* %p.addr, align 8
+  store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+  %0 = load target("spirv.Pipe", 0), target("spirv.Pipe", 0)* %p.addr, align 8
+  %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+  %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+  call spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0) %0, ptr addrspace(4) %2, i32 4, i32 4)
+  ret void
+}
+
+declare dso_local spir_func void @_Z29__spirv_ReadPipeBlockingINTELIiEv8ocl_pipePvii(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+
+define spir_func void @boo(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) {
+entry:
+  %p.addr = alloca target("spirv.Pipe", 1), align 8
+  %ptr.addr = alloca ptr addrspace(1), align 8
+  store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8
+  store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+  %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8
+  %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+  %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+  call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4)
+  ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePiii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+
+define spir_func void @baz(target("spirv.Pipe", 1) %p, ptr addrspace(1) %ptr) {
+entry:
+  %p.addr = alloca target("spirv.Pipe", 1), align 8
+  %ptr.addr = alloca ptr addrspace(1), align 8
+  store target("spirv.Pipe", 1) %p, target("spirv.Pipe", 1)* %p.addr, align 8
+  store ptr addrspace(1) %ptr, ptr %ptr.addr, align 8
+  %0 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1)* %p.addr, align 8
+  %1 = load ptr addrspace(1), ptr %ptr.addr, align 8
+  %2 = addrspacecast ptr addrspace(1) %1 to ptr addrspace(4)
+  call spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1) %0, ptr addrspace(4) %2, i32 4, i32 4)
+  ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIKiEv8ocl_pipePvii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+
+; CHECK-LLVM: declare spir_func void @__read_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32)
+; CHECK-LLVM: declare spir_func void @__write_pipe_2_bl(ptr addrspace(1), ptr addrspace(4), i32, i32)
+
+define linkonce_odr dso_local spir_func void @WritePipeBLockingi9Pointer(ptr addrspace(4) align 2 dereferenceable(2) %_Data) {
+entry:
+  %_Data.addr = alloca ptr addrspace(4), align 8
+  %_WPipe = alloca target("spirv.Pipe", 1), align 8
+  %_Data.addr.ascast = addrspacecast ptr %_Data.addr to ptr addrspace(4)
+  %_WPipe.ascast = addrspacecast target("spirv.Pipe", 1)* %_WPipe to target("spirv.Pipe", 1) addrspace(4)*
+  store ptr addrspace(4) %_Data, ptr addrspace(4) %_Data.addr.ascast, align 8
+  %0 = bitcast target("spirv.Pipe", 1)* %_WPipe to ptr
+  %1 = load target("spirv.Pipe", 1), target("spirv.Pipe", 1) addrspace(4)* %_WPipe.ascast, align 8
+  %2 = load ptr addrspace(4), ptr addrspace(4) %_Data.addr.ascast, align 8
+  call spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1) %1, ptr addrspace(4) %2, i32 2, i32 2)
+  ret void
+}
+
+declare dso_local spir_func void @_Z30__spirv_WritePipeBlockingINTELIDU9_Ev8ocl_pipePKT_ii(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+ 
+\ No newline at end of file
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
index a8d37be..c44b3bb 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
@@ -2808,6 +2808,348 @@ entry:
   ret <4 x i32> %spec.store.select7
 }
 
+define <2 x i8> @fptosi_v2f32_v2i8(<2 x float> %x) {
+; CHECK-LABEL: fptosi_v2f32_v2i8:
+; CHECK:         .functype fptosi_v2f32_v2i8 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <2 x float> %x to <2 x i8>
+  ret <2 x i8> %conv
+}
+
+define <2 x i8> @fptoui_v2f32_v2i8(<2 x float> %x) {
+; CHECK-LABEL: fptoui_v2f32_v2i8:
+; CHECK:         .functype fptoui_v2f32_v2i8 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <2 x float> %x to <2 x i8>
+  ret <2 x i8> %conv
+}
+
+define <2 x i16> @fptosi_v2f32_v2i16(<2 x float> %x) {
+; CHECK-LABEL: fptosi_v2f32_v2i16:
+; CHECK:         .functype fptosi_v2f32_v2i16 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <2 x float> %x to <2 x i16>
+  ret <2 x i16> %conv
+}
+
+define <2 x i16> @fptoui_v2f32_v2i16(<2 x float> %x) {
+; CHECK-LABEL: fptoui_v2f32_v2i16:
+; CHECK:         .functype fptoui_v2f32_v2i16 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <2 x float> %x to <2 x i16>
+  ret <2 x i16> %conv
+}
+
+define <4 x i8> @fptosi_v4f32_v4i8(<4 x float> %x) {
+; CHECK-LABEL: fptosi_v4f32_v4i8:
+; CHECK:         .functype fptosi_v4f32_v4i8 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <4 x float> %x to <4 x i8>
+  ret <4 x i8> %conv
+}
+
+define <4 x i8> @fptoui_v4f32_v4i8(<4 x float> %x) {
+; CHECK-LABEL: fptoui_v4f32_v4i8:
+; CHECK:         .functype fptoui_v4f32_v4i8 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <4 x float> %x to <4 x i8>
+  ret <4 x i8> %conv
+}
+
+define <4 x i16> @fptosi_v4f32_v4i16(<4 x float> %x) {
+; CHECK-LABEL: fptosi_v4f32_v4i16:
+; CHECK:         .functype fptosi_v4f32_v4i16 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <4 x float> %x to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <4 x i16> @fptoui_v4f32_v4i16(<4 x float> %x) {
+; CHECK-LABEL: fptoui_v4f32_v4i16:
+; CHECK:         .functype fptoui_v4f32_v4i16 (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <4 x float> %x to <4 x i16>
+  ret <4 x i16> %conv
+}
+
+define <8 x i8> @fptosi_v8f32_v8i8(<8 x float> %x) {
+; CHECK-LABEL: fptosi_v8f32_v8i8:
+; CHECK:         .functype fptosi_v8f32_v8i8 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <8 x float> %x to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i8> @fptoui_v8f32_v8i8(<8 x float> %x) {
+; CHECK-LABEL: fptoui_v8f32_v8i8:
+; CHECK:         .functype fptoui_v8f32_v8i8 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <8 x float> %x to <8 x i8>
+  ret <8 x i8> %conv
+}
+
+define <8 x i16> @fptosi_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptosi_v8f32_v8i16:
+; CHECK:         .functype fptosi_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <8 x float> %x to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <8 x i16> @fptoui_v8f32_v8i16(<8 x float> %x) {
+; CHECK-LABEL: fptoui_v8f32_v8i16:
+; CHECK:         .functype fptoui_v8f32_v8i16 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <8 x float> %x to <8 x i16>
+  ret <8 x i16> %conv
+}
+
+define <16 x i8> @fptosi_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i8:
+; CHECK:         .functype fptosi_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <16 x float> %x to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <16 x i8> @fptoui_v16f32_v16i8(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i8:
+; CHECK:         .functype fptoui_v16f32_v16i8 (v128, v128, v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 255, 255, 255, 255
+; CHECK-NEXT:    local.tee 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <16 x float> %x to <16 x i8>
+  ret <16 x i8> %conv
+}
+
+define <16 x i16> @fptosi_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptosi_v16f32_v16i16:
+; CHECK:         .functype fptosi_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 4
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_s
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptosi <16 x float> %x to <16 x i16>
+  ret <16 x i16> %conv
+}
+
+define <16 x i16> @fptoui_v16f32_v16i16(<16 x float> %x) {
+; CHECK-LABEL: fptoui_v16f32_v16i16:
+; CHECK:         .functype fptoui_v16f32_v16i16 (i32, v128, v128, v128, v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    v128.const 65535, 65535, 65535, 65535
+; CHECK-NEXT:    local.tee 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 4
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32x4.trunc_sat_f32x4_u
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    v128.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %conv = fptoui <16 x float> %x to <16 x i16>
+  ret <16 x i16> %conv
+}
+
 declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>)
 declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>)
 declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 5eb49fd..404db23 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s
+; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 
@@ -20,17 +20,17 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -64,17 +64,17 @@ define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0,
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -208,27 +208,27 @@ define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i16x8.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -276,27 +276,27 @@ define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -343,27 +343,27 @@ define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK-LABEL: four_shorts_interleave_op:
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.or
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: v128.xor
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 0, 1, 4, 5, 20, 21, 0, 1, 0, 1, 6, 7, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 0, 1, 0, 1, 16, 17, 0, 1, 0, 1, 2, 3, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -483,19 +483,19 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle  0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 ; CHECK: i16x8.extmul_low_i8x16_u
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle  0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -529,18 +529,18 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
 ; CHECK: i16x8.extmul_high_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
+; CHECK: i8x16.shuffle  0, 24, 2, 25, 4, 26, 6, 27, 8, 28, 10, 29, 12, 30, 14, 31
 ; CHECK: v128.store
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
+; CHECK: i8x16.shuffle  0, 16, 2, 17, 4, 18, 6, 19, 8, 20, 10, 21, 12, 22, 14, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -672,27 +672,27 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle  0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle  0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle  0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle  0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle  0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -740,25 +740,25 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}}, 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extmul_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}}, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}}, 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}}, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}}, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}}, 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
+; CHECK: i8x16.shuffle  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27
 ; CHECK: v128.store
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -806,27 +806,27 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle  0, 0, 4, 20, 0, 0, 5, 21, 0, 0, 6, 22, 0, 0, 7, 23
+; CHECK: i8x16.shuffle  0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
+; CHECK: i8x16.shuffle  0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19, 0, 0
+; CHECK: i8x16.shuffle  0, 0, 0, 16, 0, 0, 1, 17, 0, 0, 2, 18, 0, 0, 3, 19
+; CHECK: i8x16.shuffle  0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -1272,45 +1272,45 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun
 ; CHECK-LABEL: four_bytes_into_four_ints_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
 ; CHECK: i32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extmul_low_i16x8_u
 ; CHECK: v128.and
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i32x4.extend_low_i16x8_u
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -1365,7 +1365,7 @@ define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noun
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+; CHECK: i8x16.shuffle  0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
   %5 = icmp sgt i32 %3, 0
@@ -1396,35 +1396,35 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: i16x8.add
 ; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_u
 ; CHECK: i16x8.add
 ; CHECK: i16x8.add
 ; CHECK: i16x8.shr_u
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+; CHECK: i8x16.shuffle  0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
   %5 = icmp sgt i32 %3, 0
@@ -1492,13 +1492,13 @@ define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i3
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.avgr_u 
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i8x16.avgr_u 
-; CHECK: i8x16.shuffle {{.*}} 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
+; CHECK: i8x16.shuffle  0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
 ; CHECK: v128.store
 define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) {
   %5 = icmp sgt i32 %3, 0
@@ -1605,28 +1605,28 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: two_bytes_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s	
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1663,28 +1663,28 @@ for.body:                                         ; preds = %entry, %for.body
 
 ; CHECK-LABEL: two_bytes_two_floats_vary_op:
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load64_zero
-; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1723,38 +1723,24 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.store64_lane
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1791,38 +1777,24 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: v128.store64_lane
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1858,24 +1830,24 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1913,24 +1885,24 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: two_shorts_two_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
 ; CHECK: v128.store
 define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1969,38 +1941,22 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2037,38 +1993,22 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
 ; CHECK: v128.store
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2195,58 +2135,58 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_same_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2302,58 +2242,58 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: four_bytes_four_floats_vary_op:
 ; CHECK: loop
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle  3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK: i16x8.extend_low_i8x16_s
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2410,88 +2350,60 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
 ; CHECK: v128.store
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2544,88 +2456,60 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	255, 255, 255, 255
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.narrow_i16x8_u
+; CHECK: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
 ; CHECK: v128.store
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2678,51 +2562,51 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2779,47 +2663,47 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.mul
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.add
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.div
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
-; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i8x16.shuffle  6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
 ; CHECK: i32x4.extend_low_i16x8_s
 ; CHECK: f32x4.convert_i32x4_s
 ; CHECK: f32x4.sub
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle  4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.store
 define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2876,89 +2760,58 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
 ; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle	0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
 ; CHECK: v128.store
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -3011,89 +2864,58 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: loop
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
 ; CHECK: v128.load
 ; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.const	65535, 65535, 65535, 65535
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
 ; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i32x4.trunc_sat_f32x4_s
+; CHECK: v128.and
+; CHECK: i16x8.narrow_i32x4_u
+; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+; CHECK: i8x16.shuffle	4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
 ; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle	0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
 ; CHECK: v128.store
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
new file mode 100644
index 0000000..21b25d8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA
+
+; Ensure reloads are after narrowed i512 -> i32 store
+define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
+; POSTRA-LABEL: PR166744:
+; POSTRA:       # %bb.0:
+; POSTRA-NEXT:    movl $1029, %eax # imm = 0x405
+; POSTRA-NEXT:    shlxl %esi, %edx, %edx
+; POSTRA-NEXT:    bextrl %eax, %esi, %eax
+; POSTRA-NEXT:    movl (%rdi,%rax,4), %ecx
+; POSTRA-NEXT:    btrl %esi, %ecx
+; POSTRA-NEXT:    orl %ecx, %edx
+; POSTRA-NEXT:    movl %edx, (%rdi,%rax,4)
+; POSTRA-NEXT:    movq 16(%rdi), %rax
+; POSTRA-NEXT:    movq (%rdi), %rcx
+; POSTRA-NEXT:    movq 24(%rdi), %rdx
+; POSTRA-NEXT:    movq 8(%rdi), %rsi
+; POSTRA-NEXT:    orq 56(%rdi), %rdx
+; POSTRA-NEXT:    orq 40(%rdi), %rsi
+; POSTRA-NEXT:    orq 48(%rdi), %rax
+; POSTRA-NEXT:    orq 32(%rdi), %rcx
+; POSTRA-NEXT:    orq %rdx, %rsi
+; POSTRA-NEXT:    orq %rax, %rcx
+; POSTRA-NEXT:    orq %rsi, %rcx
+; POSTRA-NEXT:    setne %al
+; POSTRA-NEXT:    retq
+;
+; NOPOSTRA-LABEL: PR166744:
+; NOPOSTRA:       # %bb.0:
+; NOPOSTRA-NEXT:    movl %esi, %eax
+; NOPOSTRA-NEXT:    shrl $3, %eax
+; NOPOSTRA-NEXT:    andl $60, %eax
+; NOPOSTRA-NEXT:    movl (%rdi,%rax), %ecx
+; NOPOSTRA-NEXT:    btrl %esi, %ecx
+; NOPOSTRA-NEXT:    shlxl %esi, %edx, %edx
+; NOPOSTRA-NEXT:    orl %ecx, %edx
+; NOPOSTRA-NEXT:    movl %edx, (%rdi,%rax)
+; NOPOSTRA-NEXT:    movq 16(%rdi), %rax
+; NOPOSTRA-NEXT:    movq (%rdi), %rcx
+; NOPOSTRA-NEXT:    movq 8(%rdi), %rdx
+; NOPOSTRA-NEXT:    movq 24(%rdi), %rsi
+; NOPOSTRA-NEXT:    orq 56(%rdi), %rsi
+; NOPOSTRA-NEXT:    orq 40(%rdi), %rdx
+; NOPOSTRA-NEXT:    orq 48(%rdi), %rax
+; NOPOSTRA-NEXT:    orq 32(%rdi), %rcx
+; NOPOSTRA-NEXT:    orq %rsi, %rdx
+; NOPOSTRA-NEXT:    orq %rax, %rcx
+; NOPOSTRA-NEXT:    orq %rdx, %rcx
+; NOPOSTRA-NEXT:    setne %al
+; NOPOSTRA-NEXT:    retq
+  %rem = and i64 %idx, 511
+  %sh_prom = zext nneg i64 %rem to i512
+  %shl = shl nuw i512 1, %sh_prom
+  %not = xor i512 %shl, -1
+  %load = load i512, ptr %v, align 8
+  %and = and i512 %load, %not
+  %conv2 = zext i1 %b to i512
+  %shl4 = shl nuw i512 %conv2, %sh_prom
+  %or = or i512 %and, %shl4
+  store i512 %or, ptr %v, align 8
+  %cmp = icmp ne i512 %or, 0
+  ret i1 %cmp
+}
diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s
index 78e4f86..ff0dfb3 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_err.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_err.s
@@ -674,46 +674,3 @@ v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 // GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 
-// nv bit in FLAT instructions
-flat_load_ubyte v5, v[2:3] offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_load_ubyte a5, v[2:3] offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_store_dword v[2:3], v5 offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_store_dword v[2:3], a5 offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_load_ubyte v5, v[2:3], off offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_store_byte v[2:3], v5, off offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_add v[2:3], v5, off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap a1, v[2:3], a2, off glc nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap_x2 v[2:3], v[4:5], off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap_x2 v[2:3], a[4:5], off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_load_ubyte v5, off, s2 offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_load_ubyte a5, off, s2 offset:-1 nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_store_dword v2, v3, off nv
-// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
index 3af0d83..c96a72d 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -706,107 +706,107 @@ flat_load_short_d16_hi a5, v[2:3] offset:4095 glc
 flat_load_short_d16_hi a5, v[2:3] offset:4095 slc
 
 // GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_swap a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_add a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_sub a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_smin a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_umin a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_smax a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_umax a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_and a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_or a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_xor a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_inc a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_dec a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc
 
 // GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
 flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx942_err.s b/llvm/test/MC/AMDGPU/gfx942_err.s
index dc51bab..fd59a01 100644
--- a/llvm/test/MC/AMDGPU/gfx942_err.s
+++ b/llvm/test/MC/AMDGPU/gfx942_err.s
@@ -125,31 +125,3 @@ global_load_dword v[2:3], off lds
 
 scratch_load_dword v2, off lds
 // GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-// nv bit in FLAT instructions
-flat_load_ubyte v5, v[2:3] offset:4095 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_store_dword v[2:3], v5 offset:4095 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-flat_atomic_add_f32 v[2:3], v5 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_load_dword v2, v[2:3], off sc0 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_store_dword v[2:3], v5 off sc0 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_add_f64 v[0:1], v[2:3], off sc1 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-global_atomic_swap v0, v[2:3], v5 off sc0 nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_load_lds_dword v2, off nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
-
-scratch_store_dword v2, v3, off nv
-// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
index 7687c0a..5cc3d25 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
@@ -24,18 +24,6 @@ flat_load_ubyte v5, v[1:2] offset:4095 glc
 flat_load_ubyte v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ubyte v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_sbyte v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
 
@@ -60,18 +48,6 @@ flat_load_sbyte v5, v[1:2] offset:4095 glc
 flat_load_sbyte v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sbyte v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_ushort v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
 
@@ -96,18 +72,6 @@ flat_load_ushort v5, v[1:2] offset:4095 glc
 flat_load_ushort v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ushort v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ushort v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ushort v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ushort v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_sshort v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -132,18 +96,6 @@ flat_load_sshort v5, v[1:2] offset:4095 glc
 flat_load_sshort v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sshort v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sshort v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sshort v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sshort v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_dword v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
 
@@ -168,18 +120,6 @@ flat_load_dword v5, v[1:2] offset:4095 glc
 flat_load_dword v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dword v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dword v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dword v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dword v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_dwordx2 v[5:6], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
 
@@ -204,18 +144,6 @@ flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc
 flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[5:6], v[1:2] nv
-// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_dwordx3 v[5:7], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
 
@@ -240,18 +168,6 @@ flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc
 flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[5:7], v[1:2] nv
-// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_dwordx4 v[5:8], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -276,18 +192,6 @@ flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc
 flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[5:8], v[1:2] nv
-// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
-
 flat_store_byte v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
 
@@ -312,18 +216,6 @@ flat_store_byte v[1:2], v2 offset:4095 glc
 flat_store_byte v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_byte v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_byte_d16_hi v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
 
@@ -348,18 +240,6 @@ flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc
 flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_byte_d16_hi v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte_d16_hi v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_short v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
 
@@ -384,18 +264,6 @@ flat_store_short v[1:2], v2 offset:4095 glc
 flat_store_short v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_short v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_short_d16_hi v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
 
@@ -420,18 +288,6 @@ flat_store_short_d16_hi v[1:2], v2 offset:4095 glc
 flat_store_short_d16_hi v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_short_d16_hi v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short_d16_hi v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_dword v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
 
@@ -456,18 +312,6 @@ flat_store_dword v[1:2], v2 offset:4095 glc
 flat_store_dword v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dword v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dword v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dword v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dword v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_dwordx2 v[1:2], v[2:3] offset:4095
 // CHECK: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
 
@@ -492,18 +336,6 @@ flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc
 flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc
 // CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx2 v[1:2], v[2:3] nv
-// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv
-// CHECK: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_dwordx3 v[1:2], v[2:4] offset:4095
 // CHECK: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
 
@@ -528,18 +360,6 @@ flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc
 flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc
 // CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[1:2], v[2:4] nv
-// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv
-// CHECK: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
-
 flat_store_dwordx4 v[1:2], v[2:5] offset:4095
 // CHECK: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
 
@@ -564,18 +384,6 @@ flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc
 flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc
 // CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[1:2], v[2:5] nv
-// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv
-// CHECK: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
-
-flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
-
 flat_load_ubyte_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
 
@@ -600,18 +408,6 @@ flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc
 flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ubyte_d16 v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16 v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_ubyte_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
 
@@ -636,18 +432,6 @@ flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ubyte_d16_hi v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_sbyte_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
 
@@ -672,18 +456,6 @@ flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc
 flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sbyte_d16 v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16 v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_sbyte_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -708,18 +480,6 @@ flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sbyte_d16_hi v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_short_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
 
@@ -744,18 +504,6 @@ flat_load_short_d16 v5, v[1:2] offset:4095 glc
 flat_load_short_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_short_d16 v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16 v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16 v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16 v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
-
 flat_load_short_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
 
@@ -780,18 +528,6 @@ flat_load_short_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_short_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_short_d16_hi v5, v[1:2] nv
-// CHECK: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16_hi v5, v[1:2] offset:7 nv
-// CHECK: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
-
-flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
-
 flat_atomic_swap v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
 
@@ -816,18 +552,6 @@ flat_atomic_swap v0, v[1:2], v2 offset:4095 glc
 flat_atomic_swap v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_swap v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_swap v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
-
 flat_atomic_cmpswap v[1:2], v[2:3] offset:4095
 // CHECK: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
 
@@ -852,18 +576,6 @@ flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc
 flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc
 // CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[1:2], v[2:3] nv
-// CHECK: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv
-// CHECK: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
-
 flat_atomic_add v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
 
@@ -888,18 +600,6 @@ flat_atomic_add v0, v[1:2], v2 offset:4095 glc
 flat_atomic_add v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v[1:2], v2 nv
-// CHECK: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_add v[1:2], v2 offset:7 nv
-// CHECK: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv
-// CHECK: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
-
-flat_atomic_add v[1:2], v2 offset:4095 slc nv
-// CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
-
 flat_atomic_sub v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
 
@@ -1497,18 +1197,6 @@ global_load_ubyte v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_ubyte v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_sbyte v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1554,18 +1242,6 @@ global_load_sbyte v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_sbyte v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_ushort v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1611,18 +1287,6 @@ global_load_ushort v5, v1, s[4:5] offset:-1 glc
 global_load_ushort v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_ushort v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ushort v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ushort v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ushort v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_sshort v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1668,18 +1332,6 @@ global_load_sshort v5, v1, s[4:5] offset:-1 glc
 global_load_sshort v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_sshort v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sshort v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sshort v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sshort v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_dword v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1725,18 +1377,6 @@ global_load_dword v5, v1, s[4:5] offset:-1 glc
 global_load_dword v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_dword v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dword v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dword v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dword v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1782,18 +1422,6 @@ global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc
 global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_dwordx2 v[5:6], v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1839,15 +1467,6 @@ global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc
 global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_dwordx3 v[5:7], v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1893,15 +1512,6 @@ global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc
 global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_dwordx4 v[5:8], v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
-global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
-
 global_store_byte v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1947,18 +1557,6 @@ global_store_byte v1, v2, s[6:7] offset:-1 glc
 global_store_byte v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_byte v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_byte_d16_hi v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2004,18 +1602,6 @@ global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc
 global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_byte_d16_hi v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_short v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2061,18 +1647,6 @@ global_store_short v1, v2, s[6:7] offset:-1 glc
 global_store_short v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_short v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_short_d16_hi v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2118,18 +1692,6 @@ global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc
 global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_short_d16_hi v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_dword v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2175,18 +1737,6 @@ global_store_dword v1, v2, s[6:7] offset:-1 glc
 global_store_dword v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_dword v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dword v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dword v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dword v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2232,18 +1782,6 @@ global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc
 global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_dwordx2 v1, v[2:3], s[6:7] nv
-// CHECK: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2289,18 +1827,6 @@ global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc
 global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_dwordx3 v1, v[2:4], s[6:7] nv
-// CHECK: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
-
 global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x06,0x00]
 
@@ -2346,18 +1872,6 @@ global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc
 global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x06,0x00]
 
-global_store_dwordx4 v1, v[2:5], s[6:7] nv
-// CHECK: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
-
-global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
-
 global_load_ubyte_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2403,18 +1917,6 @@ global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_ubyte_d16 v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2460,18 +1962,6 @@ global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_ubyte_d16_hi v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_sbyte_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2517,18 +2007,6 @@ global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_sbyte_d16 v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2574,18 +2052,6 @@ global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_sbyte_d16_hi v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_short_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2631,18 +2097,6 @@ global_load_short_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_short_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_short_d16 v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16 v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
-
 global_load_short_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2688,18 +2142,6 @@ global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x04,0x05]
 
-global_load_short_d16_hi v5, v1, s[4:5] nv
-// CHECK: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv
-// CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
-
-global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
-
 global_atomic_swap v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2745,18 +2187,6 @@ global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc
 global_atomic_swap v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x06,0x00]
 
-global_atomic_swap v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_swap v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
-
 global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2802,18 +2232,6 @@ global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc
 global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x06,0x00]
 
-global_atomic_cmpswap v1, v[2:3], s[6:7] nv
-// CHECK: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
-
 global_atomic_add v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2859,18 +2277,6 @@ global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc
 global_atomic_add v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x06,0x00]
 
-global_atomic_add v1, v2, s[6:7] nv
-// CHECK: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_add v1, v2, s[6:7] offset:-1 nv
-// CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv
-// CHECK: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
-
-global_atomic_add v1, v2, s[6:7] offset:-1 slc nv
-// CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
-
 global_atomic_sub v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x06,0x00]
 
@@ -3951,18 +3357,6 @@ scratch_load_ubyte v5, off, s2 offset:-1 glc
 scratch_load_ubyte v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_ubyte v5, off, s2 nv
-// CHECK: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_sbyte v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4008,18 +3402,6 @@ scratch_load_sbyte v5, off, s2 offset:-1 glc
 scratch_load_sbyte v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_sbyte v5, off, s2 nv
-// CHECK: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_ushort v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4065,18 +3447,6 @@ scratch_load_ushort v5, off, s2 offset:-1 glc
 scratch_load_ushort v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_ushort v5, off, s2 nv
-// CHECK: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ushort v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ushort v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ushort v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_sshort v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4122,18 +3492,6 @@ scratch_load_sshort v5, off, s2 offset:-1 glc
 scratch_load_sshort v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_sshort v5, off, s2 nv
-// CHECK: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sshort v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sshort v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sshort v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_dword v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4179,18 +3537,6 @@ scratch_load_dword v5, off, s2 offset:-1 glc
 scratch_load_dword v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_dword v5, off, s2 nv
-// CHECK: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dword v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dword v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dword v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_dwordx2 v[5:6], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4236,18 +3582,6 @@ scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc
 scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_dwordx2 v[5:6], off, s2 nv
-// CHECK: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_dwordx3 v[5:7], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4293,18 +3627,6 @@ scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc
 scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_dwordx3 v[5:7], off, s2 nv
-// CHECK: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_dwordx4 v[5:8], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4350,18 +3672,6 @@ scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc
 scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_dwordx4 v[5:8], off, s2 nv
-// CHECK: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_store_byte off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4407,18 +3717,6 @@ scratch_store_byte off, v2, s3 offset:-1 glc
 scratch_store_byte off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_byte off, v2, s3 nv
-// CHECK: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_byte_d16_hi off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4464,18 +3762,6 @@ scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc
 scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_byte_d16_hi off, v2, s3 nv
-// CHECK: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_short off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4521,18 +3807,6 @@ scratch_store_short off, v2, s3 offset:-1 glc
 scratch_store_short off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_short off, v2, s3 nv
-// CHECK: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_short_d16_hi off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4578,18 +3852,6 @@ scratch_store_short_d16_hi off, v2, s3 offset:-1 glc
 scratch_store_short_d16_hi off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_short_d16_hi off, v2, s3 nv
-// CHECK: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short_d16_hi off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_dword off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4635,18 +3897,6 @@ scratch_store_dword off, v2, s3 offset:-1 glc
 scratch_store_dword off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_dword off, v2, s3 nv
-// CHECK: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dword off, v2, s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dword off, v2, s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dword off, v2, s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_dwordx2 off, v[2:3], s3 offset:-1
 // CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4692,18 +3942,6 @@ scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc
 scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_dwordx2 off, v[2:3], s3 nv
-// CHECK: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_dwordx3 off, v[2:4], s3 offset:-1
 // CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4749,18 +3987,6 @@ scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc
 scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_dwordx3 off, v[2:4], s3 nv
-// CHECK: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_store_dwordx4 off, v[2:5], s3 offset:-1
 // CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4806,18 +4032,6 @@ scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc
 scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
 
-scratch_store_dwordx4 off, v[2:5], s3 nv
-// CHECK: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv
-// CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
-
-scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
-
 scratch_load_ubyte_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4863,18 +4077,6 @@ scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc
 scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_ubyte_d16 v5, off, s2 nv
-// CHECK: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_ubyte_d16_hi v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4920,18 +4122,6 @@ scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_ubyte_d16_hi v5, off, s2 nv
-// CHECK: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_sbyte_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4977,18 +4167,6 @@ scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc
 scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_sbyte_d16 v5, off, s2 nv
-// CHECK: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_sbyte_d16_hi v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -5034,18 +4212,6 @@ scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_sbyte_d16_hi v5, off, s2 nv
-// CHECK: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_short_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
 
@@ -5088,18 +4254,6 @@ scratch_load_short_d16 v5, off, s2 offset:-4096
 scratch_load_short_d16 v5, off, s2 offset:-1 glc
 // CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_short_d16 v5, off, s2 nv
-// CHECK: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16 v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16 v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16 v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
-
 scratch_load_short_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
 
@@ -5148,18 +4302,6 @@ scratch_load_short_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_short_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
 
-scratch_load_short_d16_hi v5, off, s2 nv
-// CHECK: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16_hi v5, off, s2 offset:-1 nv
-// CHECK: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv
-// CHECK: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
-
-scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv
-// CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
-
 global_load_dword v[2:3], off lds
 // CHECK: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
index 4c06585..0ee659e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
@@ -21,18 +21,6 @@
 # CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_ubyte v5, v[1:2] nv           ; encoding: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_sbyte v5, v[1:2] offset:4095  ; encoding: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05
 
@@ -54,18 +42,6 @@
 # CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_sbyte v5, v[1:2] nv           ; encoding: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_ushort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05
 
@@ -87,18 +63,6 @@
 # CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_ushort v5, v[1:2] nv          ; encoding: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ushort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ushort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_sshort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05
 
@@ -120,18 +84,6 @@
 # CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_sshort v5, v[1:2] nv          ; encoding: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sshort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sshort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_dword v5, v[1:2] offset:4095  ; encoding: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05
 
@@ -153,18 +105,6 @@
 # CHECK: flat_load_dword v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_dword v5, v[1:2] nv           ; encoding: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dword v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dword v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dword v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05
 
@@ -186,18 +126,6 @@
 # CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] nv     ; encoding: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05
 
@@ -219,18 +147,6 @@
 # CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] nv     ; encoding: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05
 
@@ -252,18 +168,6 @@
 # CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] nv     ; encoding: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_store_byte v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00
 
@@ -285,18 +189,6 @@
 # CHECK: flat_store_byte v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_byte v[1:2], v2 nv           ; encoding: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte v[1:2], v2 offset:7 nv  ; encoding: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00
 
@@ -318,18 +210,6 @@
 # CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 nv    ; encoding: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_short v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00
 
@@ -351,18 +231,6 @@
 # CHECK: flat_store_short v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_short v[1:2], v2 nv          ; encoding: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00
 
@@ -384,18 +252,6 @@
 # CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_short_d16_hi v[1:2], v2 nv   ; encoding: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_dword v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00
 
@@ -417,18 +273,6 @@
 # CHECK: flat_store_dword v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_dword v[1:2], v2 nv          ; encoding: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dword v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dword v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dword v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00
 
@@ -450,18 +294,6 @@
 # CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] nv    ; encoding: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00
 
@@ -483,18 +315,6 @@
 # CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] nv    ; encoding: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv ; encoding: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv ; encoding: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00
 
@@ -516,18 +336,6 @@
 # CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00
 
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] nv    ; encoding: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv ; encoding: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
-0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv ; encoding: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00
-
-# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00
-
 # CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05
 
@@ -549,18 +357,6 @@
 # CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05
 
@@ -582,18 +378,6 @@
 # CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05
 
@@ -615,18 +399,6 @@
 # CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05
 
@@ -648,18 +420,6 @@
 # CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05
 
@@ -681,18 +441,6 @@
 # CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_short_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05
 
@@ -714,18 +462,6 @@
 # CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05
 
-# CHECK: flat_load_short_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
-0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05
-
-# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
-0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05
-
 # CHECK: flat_atomic_swap v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00
 
@@ -747,18 +483,6 @@
 # CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00
 
-# CHECK: flat_atomic_swap v[1:2], v2 nv          ; encoding: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_swap v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
-0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00
-
 # CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00
 
@@ -780,18 +504,6 @@
 # CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00
 
-# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] nv   ; encoding: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
-0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00
-
 # CHECK: flat_atomic_add v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00
 
@@ -813,18 +525,6 @@
 # CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00
 
-# CHECK: flat_atomic_add v[1:2], v2 nv           ; encoding: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_add v[1:2], v2 offset:7 nv  ; encoding: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
-0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00
-
-# CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
-0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00
-
 # CHECK: flat_atomic_sub v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00
 
@@ -1317,18 +1017,6 @@
 # CHECK: global_load_ubyte v5, v[1:2], off       ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_ubyte v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_sbyte v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1338,18 +1026,6 @@
 # CHECK: global_load_sbyte v5, v[1:2], off       ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_sbyte v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_ushort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1359,18 +1035,6 @@
 # CHECK: global_load_ushort v5, v[1:2], off      ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_ushort v5, v1, s[4:5] nv    ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_sshort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1380,18 +1044,6 @@
 # CHECK: global_load_sshort v5, v[1:2], off      ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_sshort v5, v1, s[4:5] nv    ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_dword v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1401,18 +1053,6 @@
 # CHECK: global_load_dword v5, v[1:2], off       ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_dword v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_dwordx2 v[5:6], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1422,18 +1062,6 @@
 # CHECK: global_load_dwordx2 v[5:6], v[1:2], off ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] nv ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_dwordx3 v[5:7], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1443,18 +1071,6 @@
 # CHECK: global_load_dwordx3 v[5:7], v[1:2], off ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] nv ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_dwordx4 v[5:8], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1464,18 +1080,6 @@
 # CHECK: global_load_dwordx4 v[5:8], v[1:2], off ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] nv ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_store_byte v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1485,18 +1089,6 @@
 # CHECK: global_store_byte v[1:2], v2, off       ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_byte v1, v2, s[6:7] nv     ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_byte_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1506,18 +1098,6 @@
 # CHECK: global_store_byte_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_short v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1527,18 +1107,6 @@
 # CHECK: global_store_short v[1:2], v2, off      ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_short v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_short_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1548,18 +1116,6 @@
 # CHECK: global_store_short_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_dword v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1569,18 +1125,6 @@
 # CHECK: global_store_dword v[1:2], v2, off      ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_dword v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_dwordx2 v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1590,18 +1134,6 @@
 # CHECK: global_store_dwordx2 v[1:2], v[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_dwordx3 v[1:2], v[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1611,18 +1143,6 @@
 # CHECK: global_store_dwordx3 v[1:2], v[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] nv ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_store_dwordx4 v[1:2], v[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1632,18 +1152,6 @@
 # CHECK: global_store_dwordx4 v[1:2], v[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00
 
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] nv ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
-0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00
-
-# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00
-
 # CHECK: global_load_ubyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1653,18 +1161,6 @@
 # CHECK: global_load_ubyte_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_ubyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1674,18 +1170,6 @@
 # CHECK: global_load_ubyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_sbyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1695,18 +1179,6 @@
 # CHECK: global_load_sbyte_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_sbyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1716,18 +1188,6 @@
 # CHECK: global_load_sbyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_short_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1737,18 +1197,6 @@
 # CHECK: global_load_short_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_short_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_load_short_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1758,18 +1206,6 @@
 # CHECK: global_load_short_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05
 
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
-0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05
-
-# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
-0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05
-
 # CHECK: global_atomic_swap v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1779,18 +1215,6 @@
 # CHECK: global_atomic_swap v[1:2], v2, off      ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00
 
-# CHECK: global_atomic_swap v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
-0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00
-
 # CHECK: global_atomic_cmpswap v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1812,18 +1236,6 @@
 # CHECK: global_atomic_cmpswap v1, v[2:3], v[4:5], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01]
 0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01
 
-# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
-0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00
-
 # CHECK: global_atomic_add v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1833,18 +1245,6 @@
 # CHECK: global_atomic_add v[1:2], v2, off       ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00
 
-# CHECK: global_atomic_add v1, v2, s[6:7] nv     ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
-0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00
-
-# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
-0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00
-
 # CHECK: global_atomic_sub v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00
 
@@ -2103,18 +1503,6 @@
 # CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_ubyte v5, off, s2 nv       ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_sbyte v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05
 
@@ -2154,18 +1542,6 @@
 # CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_sbyte v5, off, s2 nv       ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_ushort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05
 
@@ -2205,18 +1581,6 @@
 # CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_ushort v5, off, s2 nv      ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ushort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ushort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_sshort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05
 
@@ -2256,18 +1620,6 @@
 # CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_sshort v5, off, s2 nv      ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sshort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sshort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_dword v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05
 
@@ -2307,18 +1659,6 @@
 # CHECK: scratch_load_dword v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_dword v5, off, s2 nv       ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dword v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dword v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dword v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05
 
@@ -2358,18 +1698,6 @@
 # CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 nv ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05
 
@@ -2409,18 +1737,6 @@
 # CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 nv ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05
 
@@ -2460,18 +1776,6 @@
 # CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 nv ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_store_byte off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00
 
@@ -2511,18 +1815,6 @@
 # CHECK: scratch_store_byte off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_byte off, v2, s3 nv       ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00
 
@@ -2562,18 +1854,6 @@
 # CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_short off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00
 
@@ -2613,18 +1893,6 @@
 # CHECK: scratch_store_short off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_short off, v2, s3 nv      ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00
 
@@ -2664,18 +1932,6 @@
 # CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_short_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_dword off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00
 
@@ -2715,18 +1971,6 @@
 # CHECK: scratch_store_dword off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_dword off, v2, s3 nv      ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dword off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dword off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dword off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00
 
@@ -2766,18 +2010,6 @@
 # CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 nv ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00
 
@@ -2817,18 +2049,6 @@
 # CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 nv ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00
 
@@ -2868,18 +2088,6 @@
 # CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00
 
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 nv ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
-0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00
-
-# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
-0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00
-
 # CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05
 
@@ -2919,18 +2127,6 @@
 # CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05
 
@@ -2970,18 +2166,6 @@
 # CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05
 
@@ -3021,18 +2205,6 @@
 # CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05
 
@@ -3072,18 +2244,6 @@
 # CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05
 
@@ -3123,18 +2283,6 @@
 # CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_short_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05
 
@@ -3174,18 +2322,6 @@
 # CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05
 
-# CHECK: scratch_load_short_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
-0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05
-
-# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
-0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05
-
 # CHECK: global_load_dword v[2:3], off lds       ; encoding: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
 0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
deleted file mode 100644
index bd5f4e2..0000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16  -S < %s | FileCheck %s
-
-define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) {
-; CHECK-LABEL: define dso_local void @_Z6unpackPhS_(
-; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) {
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4
-; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
-; CHECK-NEXT:    store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]]
-;
-entry:
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ]
-  %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ]
-  %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ]
-  store i8 0, ptr %out.addr.032, align 1
-  %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3
-  %0 = load i8, ptr %arrayidx10, align 1
-  %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1
-  store i8 %0, ptr %arrayidx14, align 1
-  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2
-  %1 = load i8, ptr %arrayidx10.1, align 1
-  %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2
-  store i8 %1, ptr %arrayidx14.1, align 1
-  %add.2 = add i8 %0, %1
-  %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3
-  store i8 %add.2, ptr %arrayidx14.2, align 1
-  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1
-  %2 = load i8, ptr %arrayidx10.3, align 1
-  %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4
-  store i8 %2, ptr %arrayidx14.3, align 1
-  %add.4 = add i8 %0, %2
-  %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5
-  store i8 %add.4, ptr %arrayidx14.4, align 1
-  %add.5 = add i8 %1, %2
-  %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6
-  store i8 %add.5, ptr %arrayidx14.5, align 1
-  %add.6 = add i8 %0, %add.5
-  %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7
-  store i8 %add.6, ptr %arrayidx14.6, align 1
-  %3 = load i8, ptr %in.addr.031, align 1
-  %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8
-  store i8 %3, ptr %arrayidx14.7, align 1
-  %add.8 = add i8 %0, %3
-  %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9
-  store i8 %add.8, ptr %arrayidx14.8, align 1
-  %add.9 = add i8 %1, %3
-  %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10
-  store i8 %add.9, ptr %arrayidx14.9, align 1
-  %add.10 = add i8 %0, %add.9
-  %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11
-  store i8 %add.10, ptr %arrayidx14.10, align 1
-  %add.11 = add i8 %2, %3
-  %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12
-  store i8 %add.11, ptr %arrayidx14.11, align 1
-  %add.12 = add i8 %0, %add.11
-  %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13
-  store i8 %add.12, ptr %arrayidx14.12, align 1
-  %add.13 = add i8 %1, %add.11
-  %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14
-  store i8 %add.13, ptr %arrayidx14.13, align 1
-  %add.14 = add i8 %0, %add.13
-  %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15
-  store i8 %add.14, ptr %arrayidx14.14, align 1
-  %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16
-  %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4
-  %inc17 = add nuw nsw i32 %i.033, 1
-  %exitcond.not = icmp eq i32 %inc17, 32
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
-}
-
-!0 = distinct !{!0, !1}
-!1 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll
new file mode 100644
index 0000000..fe7f43f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked_ldst_sme.ll
@@ -0,0 +1,187 @@
+; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @wombat(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i8 %arg6) #0 {
+; CHECK-LABEL: define void @wombat(
+; CHECK-SAME: i32 [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]], ptr [[ARG3:%.*]], ptr [[ARG4:%.*]], ptr [[ARG5:%.*]], i8 [[ARG6:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp sgt i32 [[ARG]], 0
+; CHECK-NEXT:    br i1 [[ICMP]], label %[[BB7:.*]], label %[[BB25:.*]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext nneg i32 [[ARG]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[ZEXT]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[ZEXT]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[ARG5]], i64 [[ZEXT]]
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[ZEXT]]
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[ZEXT]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND05:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND16:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
+; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP3]]
+; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT:    [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP4]]
+; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    [[BOUND016:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND117:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; CHECK-NEXT:    [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
+; CHECK-NEXT:    [[BOUND020:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP3]]
+; CHECK-NEXT:    [[BOUND121:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
+; CHECK-NEXT:    [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; CHECK-NEXT:    [[BOUND024:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP4]]
+; CHECK-NEXT:    [[BOUND125:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
+; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX27]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[ARG6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META3:![0-9]+]], !noalias [[META5:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD28:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META9:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP8]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META10:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD28]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP10]], ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META3]], !noalias [[META5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META11:![0-9]+]], !noalias [[META12:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD28]], [[WIDE_MASKED_LOAD28]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD30]], [[TMP12]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META11]], !noalias [[META12]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[BB24:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[BB7]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label %[[BB8:.*]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD22:%.*]], %[[BB21:.*]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GETELEMENTPTR]], align 1
+; CHECK-NEXT:    [[ICMP9:%.*]] = icmp ult i8 [[LOAD]], [[ARG6]]
+; CHECK-NEXT:    br i1 [[ICMP9]], label %[[BB21]], label %[[BB10:.*]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[GETELEMENTPTR11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD12:%.*]] = load i8, ptr [[GETELEMENTPTR11]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG3]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD14:%.*]] = load i8, ptr [[GETELEMENTPTR13]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG4]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD16:%.*]] = load i8, ptr [[GETELEMENTPTR15]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[LOAD16]], [[LOAD14]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[LOAD12]]
+; CHECK-NEXT:    store i8 [[ADD]], ptr [[GETELEMENTPTR11]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR17:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG2]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD18:%.*]] = load i8, ptr [[GETELEMENTPTR17]], align 1
+; CHECK-NEXT:    [[MUL19:%.*]] = mul i8 [[LOAD14]], [[LOAD14]]
+; CHECK-NEXT:    [[ADD20:%.*]] = add i8 [[LOAD18]], [[MUL19]]
+; CHECK-NEXT:    store i8 [[ADD20]], ptr [[GETELEMENTPTR17]], align 1
+; CHECK-NEXT:    br label %[[BB21]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[ADD22]] = add nuw nsw i64 [[PHI]], 1
+; CHECK-NEXT:    [[ICMP23:%.*]] = icmp eq i64 [[ADD22]], [[ZEXT]]
+; CHECK-NEXT:    br i1 [[ICMP23]], label %[[BB24]], label %[[BB8]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       [[BB24]]:
+; CHECK-NEXT:    br label %[[BB25]]
+; CHECK:       [[BB25]]:
+; CHECK-NEXT:    ret void
+;
+bb:
+  %icmp = icmp sgt i32 %arg, 0
+  br i1 %icmp, label %bb7, label %bb25
+
+bb7:                                              ; preds = %bb
+  %zext = zext nneg i32 %arg to i64
+  br label %bb8
+
+bb8:                                              ; preds = %bb21, %bb7
+  %phi = phi i64 [ 0, %bb7 ], [ %add22, %bb21 ]
+  %getelementptr = getelementptr inbounds nuw i8, ptr %arg5, i64 %phi
+  %load = load i8, ptr %getelementptr, align 1
+  %icmp9 = icmp ult i8 %load, %arg6
+  br i1 %icmp9, label %bb21, label %bb10
+
+bb10:                                             ; preds = %bb8
+  %getelementptr11 = getelementptr inbounds nuw i8, ptr %arg1, i64 %phi
+  %load12 = load i8, ptr %getelementptr11, align 1
+  %getelementptr13 = getelementptr inbounds nuw i8, ptr %arg3, i64 %phi
+  %load14 = load i8, ptr %getelementptr13, align 1
+  %getelementptr15 = getelementptr inbounds nuw i8, ptr %arg4, i64 %phi
+  %load16 = load i8, ptr %getelementptr15, align 1
+  %mul = mul i8 %load16, %load14
+  %add = add i8 %mul, %load12
+  store i8 %add, ptr %getelementptr11, align 1
+  %getelementptr17 = getelementptr inbounds nuw i8, ptr %arg2, i64 %phi
+  %load18 = load i8, ptr %getelementptr17, align 1
+  %mul19 = mul i8 %load14, %load14
+  %add20 = add i8 %load18, %mul19
+  store i8 %add20, ptr %getelementptr17, align 1
+  br label %bb21
+
+bb21:                                             ; preds = %bb10, %bb8
+  %add22 = add nuw nsw i64 %phi, 1
+  %icmp23 = icmp eq i64 %add22, %zext
+  br i1 %icmp23, label %bb24, label %bb8, !llvm.loop !0
+
+bb24:                                             ; preds = %bb21
+  br label %bb25
+
+bb25:                                             ; preds = %bb24, %bb
+  ret void
+}
+
+attributes #0 = { uwtable vscale_range(1,16) "aarch64_pstate_sm_body" "target-features"="+fp-armv8,+neon,+sme,+v8a,-fmv" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 16}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
+; CHECK: [[META5]] = !{[[META6:![0-9]+]], [[META1]], [[META7:![0-9]+]], [[META8:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]], [[META2]]}
+; CHECK: [[META7]] = distinct !{[[META7]], [[META2]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META2]]}
+; CHECK: [[META9]] = !{[[META7]]}
+; CHECK: [[META10]] = !{[[META8]]}
+; CHECK: [[META11]] = !{[[META6]]}
+; CHECK: [[META12]] = !{[[META1]], [[META7]], [[META8]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]}
+; CHECK: [[META14]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META15]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index cdddcc9..68cfc65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
-; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
 
 target triple = "arm64-apple-macosx15.0.0"
 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index 54b7f2a..f2ae327 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef
 ; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]]
-; CHECK-NEXT:    store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]]
+; CHECK-NEXT:    store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144
 ; CHECK-NEXT:    br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[FOR_END11]]:
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll
new file mode 100644
index 0000000..921bcf0
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/sve-interleave-splat.ll
@@ -0,0 +1,11 @@
+; RUN: opt -passes=vector-combine %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 4 x i16> @interleave2_same_const_splat_nxv4i16() {
+;CHECK-LABEL: @interleave2_same_const_splat_nxv4i16(
+;CHECK: call <vscale x 4 x i16> @llvm.vector.interleave2
+;CHECK: ret <vscale x 4 x i16> %retval
+  %retval = call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3))
+  ret <vscale x 4 x i16> %retval
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll
new file mode 100644
index 0000000..2926371
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR
+
+define i64 @test1(i64 %i) nounwind readnone {
+  %loc = alloca i64
+  %j = load i64, ptr %loc
+  %r = add i64 %i, %j
+  ret i64 %r
+}
+
+define i64 @test2(i32 %i) nounwind readnone {
+  %loc = alloca i32
+  %j = load i32, ptr %loc
+  %r = add i32 %i, %j
+  %ext = zext i32 %r to i64
+  ret i64 %ext
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected
new file mode 100644
index 0000000..88cb03e
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR
+
+define i64 @test1(i64 %i) nounwind readnone {
+; ASM-LABEL: test1:
+; ASM:       # %bb.0:
+; ASM-NEXT:    movq %rdi, %rax
+; ASM-NEXT:    addq -{{[0-9]+}}(%rsp), %rax
+; ASM-NEXT:    retq
+; MIR-LABEL: name: test1
+; MIR: bb.0 (%ir-block.0):
+; MIR-NEXT:   liveins: $rdi
+; MIR-NEXT: {{  $}}
+; MIR-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+; MIR-NEXT:   [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc)
+; MIR-NEXT:   $rax = COPY [[ADD64rm]]
+; MIR-NEXT:   RET 0, $rax
+  %loc = alloca i64
+  %j = load i64, ptr %loc
+  %r = add i64 %i, %j
+  ret i64 %r
+}
+
+define i64 @test2(i32 %i) nounwind readnone {
+; ASM-LABEL: test2:
+; ASM:       # %bb.0:
+; ASM-NEXT:    movl %edi, %eax
+; ASM-NEXT:    addl -{{[0-9]+}}(%rsp), %eax
+; ASM-NEXT:    retq
+; MIR-LABEL: name: test2
+; MIR: bb.0 (%ir-block.0):
+; MIR-NEXT:   liveins: $edi
+; MIR-NEXT: {{  $}}
+; MIR-NEXT:   [[COPY:%[0-9]+]]:gr32 = COPY $edi
+; MIR-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc)
+; MIR-NEXT:   [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit
+; MIR-NEXT:   $rax = COPY [[SUBREG_TO_REG]]
+; MIR-NEXT:   RET 0, $rax
+  %loc = alloca i32
+  %j = load i32, ptr %loc
+  %r = add i32 %i, %j
+  %ext = zext i32 %r to i64
+  ret i64 %ext
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll
new file mode 100644
index 0000000..7167bcf
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK
+
+define i32 @add(i32 %a, i32 %b) {
+  %sum = add i32 %a, %b
+  ret i32 %sum
+}
+
+define i32 @sub(i32 %a, i32 %b) {
+  %diff = sub i32 %a, %b
+  ret i32 %diff
+}
+
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected
new file mode 100644
index 0000000..1ba920d
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK
+
+define i32 @add(i32 %a, i32 %b) {
+  %sum = add i32 %a, %b
+  ret i32 %sum
+}
+
+define i32 @sub(i32 %a, i32 %b) {
+  %diff = sub i32 %a, %b
+  ret i32 %diff
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test
new file mode 100644
index 0000000..6fc57b5
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test
@@ -0,0 +1,9 @@
+# REQUIRES: x86-registered-target
+## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file
+
+# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll
+# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll
+
+## Verify that running the script again on an already updated file doesn't add duplicate checks
+# RUN: %update_llc_test_checks %t.ll
+# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test
new file mode 100644
index 0000000..0f8aaa54
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test
@@ -0,0 +1,8 @@
+# REQUIRES: x86-registered-target
+## Test that using the same prefix for both ASM and MIR outputs generates a warning
+## and doesn't produce any checks.
+
+# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING
+# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll
+
+# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 59a9ea1..c1791dfa 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1132,8 +1132,7 @@ TEST_F(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
       new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
   VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
   VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
-  VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, Load->getAlign(), {},
-                           {});
+  VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1250,8 +1249,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
         new LoadInst(Int32, PoisonValue::get(Int32Ptr), "", false, Align(1));
     VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
     VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
-    VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, Load->getAlign(),
-                             {}, {});
+    VPWidenLoadRecipe Recipe(*Load, Addr, Mask, true, false, {}, {});
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
     EXPECT_TRUE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1265,8 +1263,8 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1));
     VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2));
     VPValue *StoredV = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 3));
-    VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false,
-                              Store->getAlign(), {}, {});
+    VPWidenStoreRecipe Recipe(*Store, Addr, StoredV, Mask, false, false, {},
+                              {});
     EXPECT_TRUE(Recipe.mayHaveSideEffects());
     EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_TRUE(Recipe.mayWriteToMemory());
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index 2dad16a..baa0377 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -605,6 +605,7 @@ TRIPLE_IR_RE = re.compile(r'^\s*target\s+triple\s*=\s*"([^"]+)"$')
 TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)")
 MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)")
 DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)")
+STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)")
 
 IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_")
 IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+")
diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py
index 24bb8b3..01ee0e1 100644
--- a/llvm/utils/UpdateTestChecks/mir.py
+++ b/llvm/utils/UpdateTestChecks/mir.py
@@ -163,13 +163,15 @@ def add_mir_checks_for_function(
     print_fixed_stack,
     first_check_is_next,
     at_the_function_name,
+    check_indent=None,
 ):
     printed_prefixes = set()
     for run in run_list:
         for prefix in run[0]:
             if prefix in printed_prefixes:
                 break
-            if not func_dict[prefix][func_name]:
+            # func_name may be absent (or empty) after a prefix conflict.
+            if not func_dict[prefix].get(func_name):
                 continue
             if printed_prefixes:
                 # Add some space between different check prefixes.
@@ -185,6 +187,7 @@ def add_mir_checks_for_function(
                 func_dict[prefix][func_name],
                 print_fixed_stack,
                 first_check_is_next,
+                check_indent,
             )
             break
         else:
@@ -204,6 +207,7 @@ def add_mir_check_lines(
     func_info,
     print_fixed_stack,
     first_check_is_next,
+    check_indent=None,
 ):
     func_body = str(func_info).splitlines()
     if single_bb:
@@ -220,7 +224,10 @@ def add_mir_check_lines(
     first_line = func_body[0]
     indent = len(first_line) - len(first_line.lstrip(" "))
     # A check comment, indented the appropriate amount
-    check = "{:>{}}; {}".format("", indent, prefix)
+    if check_indent is not None:
+        check = "{}; {}".format(check_indent, prefix)
+    else:
+        check = "{:>{}}; {}".format("", indent, prefix)
 
     output_lines.append("{}-LABEL: name: {}".format(check, func_name))
 
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 3c3fdf7..9c64db5 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -38,6 +38,7 @@ static_library("bugprone") {
     "EasilySwappableParametersCheck.cpp",
     "EmptyCatchCheck.cpp",
     "ExceptionEscapeCheck.cpp",
+    "FloatLoopCounterCheck.cpp",
     "FoldInitTypeCheck.cpp",
     "ForwardDeclarationNamespaceCheck.cpp",
     "ForwardingReferenceOverloadCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
index 1eae289..16f914a 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
@@ -16,7 +16,6 @@ static_library("cert") {
   ]
   sources = [
     "CERTTidyModule.cpp",
-    "FloatLoopCounter.cpp",
     "LimitedRandomnessCheck.cpp",
     "MutatingCopyCheck.cpp",
     "ProperlySeededRandomGeneratorCheck.cpp",
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index cd9512f..b1f20a7 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1,8 +1,3 @@
-Analysis/LoopAccessAnalysis/memcheck-ni.ll
-Analysis/MemorySSA/pr116227.ll
-Analysis/MemorySSA/pr43641.ll
-Analysis/MemorySSA/pr46574.ll
-Analysis/MemorySSA/update-remove-dead-blocks.ll
 Bitcode/fcmp-fast.ll
 Bitcode/flags.ll
 CodeGen/AArch64/cgdata-merge-local.ll
@@ -26,27 +21,12 @@ CodeGen/X86/nocfivalue.ll
 DebugInfo/AArch64/ir-outliner.ll
 DebugInfo/assignment-tracking/X86/hotcoldsplit.ll
 DebugInfo/Generic/block-asan.ll
-DebugInfo/KeyInstructions/Generic/loop-unswitch.ll
 DebugInfo/X86/asan_debug_info.ll
 LTO/X86/diagnostic-handler-remarks-with-hotness.ll
 Other/optimization-remarks-auto.ll
 Other/X86/debugcounter-partiallyinlinelibcalls.ll
-Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
-Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
-Transforms/AtomicExpand/AArch64/pcsections.ll
 Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll
-Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll
-Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
-Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll
-Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll
-Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
-Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
-Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
-Transforms/AtomicExpand/SPARC/libcalls.ll
 Transforms/AtomicExpand/SPARC/partword.ll
-Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
-Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll
-Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
 Transforms/Attributor/align.ll
 Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll
 Transforms/Attributor/ArgumentPromotion/2008-07-02-array-indexing.ll
@@ -265,14 +245,13 @@ Transforms/InstCombine/and2.ll
 Transforms/InstCombine/and-fcmp.ll
 Transforms/InstCombine/and.ll
 Transforms/InstCombine/and-or-icmps.ll
-Transforms/InstCombine/and-or-implied-cond-not.ll
 Transforms/InstCombine/apint-div1.ll
 Transforms/InstCombine/apint-div2.ll
 Transforms/InstCombine/ashr-demand.ll
 Transforms/InstCombine/atomic.ll
 Transforms/InstCombine/binop-cast.ll
-Transforms/InstCombine/binop-select.ll
 Transforms/InstCombine/binop-select-cast-of-select-cond.ll
+Transforms/InstCombine/binop-select.ll
 Transforms/InstCombine/bit-checks.ll
 Transforms/InstCombine/bitreverse.ll
 Transforms/InstCombine/branch.ll
@@ -298,7 +277,6 @@ Transforms/InstCombine/fold-ctpop-of-not.ll
 Transforms/InstCombine/fold-ext-eq-c-with-op.ll
 Transforms/InstCombine/free-inversion.ll
 Transforms/InstCombine/icmp-and-lowbit-mask.ll
-Transforms/InstCombine/icmp-equality-test.ll
 Transforms/InstCombine/icmp.ll
 Transforms/InstCombine/icmp-mul-and.ll
 Transforms/InstCombine/icmp-of-and-x.ll
@@ -307,7 +285,6 @@ Transforms/InstCombine/icmp-select-implies-common-op.ll
 Transforms/InstCombine/icmp-select.ll
 Transforms/InstCombine/icmp-with-selects.ll
 Transforms/InstCombine/intrinsic-select.ll
-Transforms/InstCombine/known-never-nan.ll
 Transforms/InstCombine/ldexp-ext.ll
 Transforms/InstCombine/ldexp.ll
 Transforms/InstCombine/load-bitcast-select.ll
@@ -347,13 +324,11 @@ Transforms/InstCombine/or.ll
 Transforms/InstCombine/pow-1.ll
 Transforms/InstCombine/pow-3.ll
 Transforms/InstCombine/pow-sqrt.ll
-Transforms/InstCombine/pr24354.ll
 Transforms/InstCombine/pull-conditional-binop-through-shift.ll
 Transforms/InstCombine/rem.ll
 Transforms/InstCombine/sdiv-canonicalize.ll
 Transforms/InstCombine/sdiv-guard.ll
 Transforms/InstCombine/select-and-or.ll
-Transforms/InstCombine/select-bitext.ll
 Transforms/InstCombine/select-cmp-br.ll
 Transforms/InstCombine/select-cmp.ll
 Transforms/InstCombine/select-factorize.ll
@@ -362,7 +337,6 @@ Transforms/InstCombine/select.ll
 Transforms/InstCombine/select-min-max.ll
 Transforms/InstCombine/select-of-symmetric-selects.ll
 Transforms/InstCombine/select-select.ll
-Transforms/InstCombine/select-with-extreme-eq-cond.ll
 Transforms/InstCombine/shift.ll
 Transforms/InstCombine/shuffle-select-narrow-inseltpoison.ll
 Transforms/InstCombine/shuffle-select-narrow.ll
@@ -512,66 +486,12 @@ Transforms/LoopBoundSplit/bug51866.ll
 Transforms/LoopBoundSplit/bug-loop-bound-split-phi-in-exit-block.ll
 Transforms/LoopBoundSplit/loop-bound-split.ll
 Transforms/LoopDeletion/invalidate-scev-after-hoisting.ll
-Transforms/LoopDistribute/basic-with-memchecks.ll
-Transforms/LoopDistribute/bounds-expansion-bug.ll
-Transforms/LoopDistribute/cross-partition-access.ll
-Transforms/LoopDistribute/debug-loc.ll
-Transforms/LoopDistribute/debug-print.ll
-Transforms/LoopDistribute/diagnostics.ll
-Transforms/LoopDistribute/followup.ll
-Transforms/LoopDistribute/laa-invalidation.ll
-Transforms/LoopDistribute/outside-use.ll
-Transforms/LoopDistribute/pointer-phi-in-loop.ll
-Transforms/LoopDistribute/scev-inserted-runtime-check.ll
-Transforms/LoopDistribute/symbolic-stride.ll
-Transforms/LoopFlatten/loop-flatten-version.ll
 Transforms/LoopIdiom/AArch64/byte-compare-index.ll
 Transforms/LoopIdiom/AArch64/find-first-byte.ll
 Transforms/LoopIdiom/RISCV/byte-compare-index.ll
-Transforms/LoopIdiom/X86/arithmetic-right-shift-until-zero.ll
-Transforms/LoopIdiom/X86/left-shift-until-bittest.ll
-Transforms/LoopIdiom/X86/left-shift-until-zero.ll
-Transforms/LoopIdiom/X86/logical-right-shift-until-zero-debuginfo.ll
-Transforms/LoopIdiom/X86/logical-right-shift-until-zero.ll
-Transforms/LoopLoadElim/forward.ll
-Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll
-Transforms/LoopLoadElim/memcheck.ll
-Transforms/LoopLoadElim/pr47457.ll
-Transforms/LoopLoadElim/symbolic-stride.ll
-Transforms/LoopLoadElim/unknown-stride-known-dep.ll
-Transforms/LoopLoadElim/versioning-scev-invalidation.ll
-Transforms/LoopPredication/preserve-bpi.ll
-Transforms/LoopSimplifyCFG/constant-fold-branch.ll
-Transforms/LoopSimplifyCFG/handle_dead_exits.ll
-Transforms/LoopSimplifyCFG/invalidate-scev-dispositions-2.ll
-Transforms/LoopSimplifyCFG/invalidate-scev-dispositions.ll
-Transforms/LoopSimplifyCFG/lcssa.ll
-Transforms/LoopSimplifyCFG/live_block_marking.ll
-Transforms/LoopSimplifyCFG/mssa_update.ll
-Transforms/LoopSimplifyCFG/pr117537.ll
-Transforms/LoopSimplifyCFG/update_parents.ll
 Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll
 Transforms/LoopUnroll/peel-last-iteration-with-guards.ll
 Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
-Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
-Transforms/LoopVersioning/add-phi-update-users.ll
-Transforms/LoopVersioning/basic.ll
-Transforms/LoopVersioning/bound-check-partially-known.ll
-Transforms/LoopVersioning/crash-36998.ll
-Transforms/LoopVersioning/exit-block-dominates-rt-check-block.ll
-Transforms/LoopVersioning/incorrect-phi.ll
-Transforms/LoopVersioning/invalidate-laa-after-versioning.ll
-Transforms/LoopVersioning/lcssa.ll
-Transforms/LoopVersioningLICM/load-from-unknown-address.ll
-Transforms/LoopVersioningLICM/loopversioningLICM1.ll
-Transforms/LoopVersioningLICM/loopversioningLICM2.ll
-Transforms/LoopVersioningLICM/metadata.ll
-Transforms/LoopVersioning/loop-invariant-bound.ll
-Transforms/LoopVersioning/noalias.ll
-Transforms/LoopVersioning/noalias-version-twice.ll
-Transforms/LoopVersioning/single-iteration.ll
-Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll
-Transforms/LoopVersioning/wrapping-pointer-versioning.ll
 Transforms/LowerAtomic/atomic-load.ll
 Transforms/LowerAtomic/atomic-swap.ll
 Transforms/LowerConstantIntrinsics/builtin-object-size-phi.ll
@@ -740,27 +660,6 @@ Transforms/Scalarizer/scatter-order.ll
 Transforms/Scalarizer/variable-extractelement.ll
 Transforms/Scalarizer/variable-insertelement.ll
 Transforms/Scalarizer/vector-of-pointer-to-vector.ll
-Transforms/SimpleLoopUnswitch/debuginfo.ll
-Transforms/SimpleLoopUnswitch/delete-dead-blocks.ll
-Transforms/SimpleLoopUnswitch/endless-unswitch.ll
-Transforms/SimpleLoopUnswitch/guards.ll
-Transforms/SimpleLoopUnswitch/inject-invariant-conditions-exponential.ll
-Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
-Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll
-Transforms/SimpleLoopUnswitch/nontrivial-unswitch-skip-selects-in-guards.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch-mssa-threshold.ll
-Transforms/SimpleLoopUnswitch/partial-unswitch-update-memoryssa.ll
-Transforms/SimpleLoopUnswitch/pr138509.ll
-Transforms/SimpleLoopUnswitch/pr59546.ll
-Transforms/SimpleLoopUnswitch/pr60736.ll
-Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
-Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
-Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
 Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 Transforms/StructurizeCFG/callbr.ll
 Transforms/StructurizeCFG/hoist-zerocost.ll
diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py
index 8c57e75..98864be 100755
--- a/llvm/utils/update_llc_test_checks.py
+++ b/llvm/utils/update_llc_test_checks.py
@@ -15,7 +15,7 @@ import argparse
 import os  # Used to advertise this file's name ("autogenerated_note").
 import sys
 
-from UpdateTestChecks import common
+from UpdateTestChecks import common, mir
 
 # llc is the only llc-like in the LLVM tree but downstream forks can add
 # additional ones here if they have them.
@@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo):
             break
 
     run_list = []
+    mir_run_list = []
     for l in ti.run_lines:
         if "|" not in l:
             common.warn("Skipping unparsable RUN line: " + l)
@@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo):
         if m:
             march_in_cmd = m.groups()[0]
 
+        target_list = run_list
         m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd)
         if m and m.groups()[0] == "isel":
             from UpdateTestChecks import isel as output_type
+        elif not m and common.STOP_PASS_RE.search(llc_cmd):
+            # MIR output mode. If -debug-only is present assume
+            # the debug output is the main point of interest.
+            target_list = mir_run_list
         else:
             from UpdateTestChecks import asm as output_type
 
@@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo):
 
         # FIXME: We should use multiple check prefixes to common check lines. For
         # now, we just ignore all but the last.
-        run_list.append(
+        target_list.append(
             (
                 check_prefixes,
                 llc_tool,
@@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo):
         ginfo=ginfo,
     )
 
-    for (
-        prefixes,
-        llc_tool,
-        llc_args,
-        preprocess_cmd,
-        triple_in_cmd,
-        march_in_cmd,
-    ) in run_list:
+    # Dictionary to store MIR function bodies separately
+    mir_func_dict = {}
+    for run_tuple, is_mir in [(run, False) for run in run_list] + [
+        (run, True) for run in mir_run_list
+    ]:
+        (
+            prefixes,
+            llc_tool,
+            llc_args,
+            preprocess_cmd,
+            triple_in_cmd,
+            march_in_cmd,
+        ) = run_tuple
+
         common.debug("Extracted LLC cmd:", llc_tool, llc_args)
         common.debug("Extracted FileCheck prefixes:", str(prefixes))
 
@@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo):
         if not triple:
             triple = common.get_triple_from_march(march_in_cmd)
 
-        scrubber, function_re = output_type.get_run_handler(triple)
-        if 0 == builder.process_run_line(
-            function_re, scrubber, raw_tool_output, prefixes
-        ):
-            common.warn(
-                "Couldn't match any function. Possibly the wrong target triple has been provided"
+        if is_mir:
+            # MIR output mode
+            common.debug("Detected MIR output mode for prefixes:", str(prefixes))
+            for prefix in prefixes:
+                if prefix not in mir_func_dict:
+                    mir_func_dict[prefix] = {}
+
+            mir.build_function_info_dictionary(
+                ti.path,
+                raw_tool_output,
+                triple,
+                prefixes,
+                mir_func_dict,
+                ti.args.verbose,
             )
-        builder.processed_prefixes(prefixes)
+        else:
+            # ASM output mode
+            scrubber, function_re = output_type.get_run_handler(triple)
+            if 0 == builder.process_run_line(
+                function_re, scrubber, raw_tool_output, prefixes
+            ):
+                common.warn(
+                    "Couldn't match any function. Possibly the wrong target triple has been provided"
+                )
+            builder.processed_prefixes(prefixes)
 
     func_dict = builder.finish_and_get_func_dict()
+
+    # Check for conflicts: same prefix used for both ASM and MIR
+    conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys())
+    if conflicting_prefixes:
+        common.warn(
+            "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format(
+                ", ".join(sorted(conflicting_prefixes))
+            ),
+            test_file=ti.path,
+        )
+        for prefix in conflicting_prefixes:
+            mir_func_dict[prefix] = {}
+            func_dict[prefix] = {}
+
     global_vars_seen_dict = {}
 
     is_in_function = False
     is_in_function_start = False
     func_name = None
     prefix_set = set([prefix for p in run_list for prefix in p[0]])
+    prefix_set.update([prefix for p in mir_run_list for prefix in p[0]])
     common.debug("Rewriting FileCheck prefixes:", str(prefix_set))
     output_lines = []
 
@@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo):
                         is_filtered=builder.is_filtered(),
                     )
                 )
+
+                # Also add MIR checks if we have them for this function
+                if mir_run_list and func_name:
+                    mir.add_mir_checks_for_function(
+                        ti.path,
+                        output_lines,
+                        mir_run_list,
+                        mir_func_dict,
+                        func_name,
+                        single_bb=False,  # Don't skip basic block labels.
+                        print_fixed_stack=False,  # Don't print fixed stack (ASM tests don't need it).
+                        first_check_is_next=False,  # First check is LABEL, not NEXT.
+                        at_the_function_name=False,  # Use "name:" not "@name".
+                        check_indent="",  # No indentation for IR files (not MIR files).
+                    )
+
                 is_in_function_start = False
 
             if is_in_function: